1164 lines
103 KiB
Plaintext
1164 lines
103 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "2f8e19e4",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Lineare Regression mit mehreren Features ($d>1$)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "643861b2",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"# plotting settings\n",
|
||
"pd.plotting.register_matplotlib_converters()\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"%matplotlib inline\n",
|
||
"import seaborn as sns\n",
|
||
"from tqdm.notebook import tqdm"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "315bd31f",
|
||
"metadata": {},
|
||
"source": [
|
||
"Wir verwenden hier beispielhaft den Datensatz [Melbourne Housing Snapshot](https://www.kaggle.com/datasets/dansbecker/melbourne-housing-snapshot). Diesen finden Sie auch im Moodle unter `data/kaggle/melb_data.csv`."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "e3381ac0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',\n",
|
||
" 'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',\n",
|
||
" 'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',\n",
|
||
" 'Longtitude', 'Regionname', 'Propertycount'],\n",
|
||
" dtype='object')"
|
||
]
|
||
},
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"melbourne_file_path = 'data/melb_data.csv'\n",
|
||
"melbourne_data = pd.read_csv(melbourne_file_path)\n",
|
||
"melbourne_data = melbourne_data.dropna(axis=0) # entfernen von Daten mit fehlenden Werten\n",
|
||
"melbourne_data.columns # Spaltennamen der Tabelle (potentielle Features)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "0f80237c",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Suburb</th>\n",
|
||
" <th>Address</th>\n",
|
||
" <th>Rooms</th>\n",
|
||
" <th>Type</th>\n",
|
||
" <th>Price</th>\n",
|
||
" <th>Method</th>\n",
|
||
" <th>SellerG</th>\n",
|
||
" <th>Date</th>\n",
|
||
" <th>Distance</th>\n",
|
||
" <th>Postcode</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>Bathroom</th>\n",
|
||
" <th>Car</th>\n",
|
||
" <th>Landsize</th>\n",
|
||
" <th>BuildingArea</th>\n",
|
||
" <th>YearBuilt</th>\n",
|
||
" <th>CouncilArea</th>\n",
|
||
" <th>Lattitude</th>\n",
|
||
" <th>Longtitude</th>\n",
|
||
" <th>Regionname</th>\n",
|
||
" <th>Propertycount</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Abbotsford</td>\n",
|
||
" <td>25 Bloomburg St</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>h</td>\n",
|
||
" <td>1035000.0</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>Biggin</td>\n",
|
||
" <td>4/02/2016</td>\n",
|
||
" <td>2.5</td>\n",
|
||
" <td>3067.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>156.0</td>\n",
|
||
" <td>79.0</td>\n",
|
||
" <td>1900.0</td>\n",
|
||
" <td>Yarra</td>\n",
|
||
" <td>-37.8079</td>\n",
|
||
" <td>144.9934</td>\n",
|
||
" <td>Northern Metropolitan</td>\n",
|
||
" <td>4019.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Abbotsford</td>\n",
|
||
" <td>5 Charles St</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>h</td>\n",
|
||
" <td>1465000.0</td>\n",
|
||
" <td>SP</td>\n",
|
||
" <td>Biggin</td>\n",
|
||
" <td>4/03/2017</td>\n",
|
||
" <td>2.5</td>\n",
|
||
" <td>3067.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>134.0</td>\n",
|
||
" <td>150.0</td>\n",
|
||
" <td>1900.0</td>\n",
|
||
" <td>Yarra</td>\n",
|
||
" <td>-37.8093</td>\n",
|
||
" <td>144.9944</td>\n",
|
||
" <td>Northern Metropolitan</td>\n",
|
||
" <td>4019.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Abbotsford</td>\n",
|
||
" <td>55a Park St</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>h</td>\n",
|
||
" <td>1600000.0</td>\n",
|
||
" <td>VB</td>\n",
|
||
" <td>Nelson</td>\n",
|
||
" <td>4/06/2016</td>\n",
|
||
" <td>2.5</td>\n",
|
||
" <td>3067.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>120.0</td>\n",
|
||
" <td>142.0</td>\n",
|
||
" <td>2014.0</td>\n",
|
||
" <td>Yarra</td>\n",
|
||
" <td>-37.8072</td>\n",
|
||
" <td>144.9941</td>\n",
|
||
" <td>Northern Metropolitan</td>\n",
|
||
" <td>4019.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Abbotsford</td>\n",
|
||
" <td>124 Yarra St</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>h</td>\n",
|
||
" <td>1876000.0</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>Nelson</td>\n",
|
||
" <td>7/05/2016</td>\n",
|
||
" <td>2.5</td>\n",
|
||
" <td>3067.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>245.0</td>\n",
|
||
" <td>210.0</td>\n",
|
||
" <td>1910.0</td>\n",
|
||
" <td>Yarra</td>\n",
|
||
" <td>-37.8024</td>\n",
|
||
" <td>144.9993</td>\n",
|
||
" <td>Northern Metropolitan</td>\n",
|
||
" <td>4019.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Abbotsford</td>\n",
|
||
" <td>98 Charles St</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>h</td>\n",
|
||
" <td>1636000.0</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>Nelson</td>\n",
|
||
" <td>8/10/2016</td>\n",
|
||
" <td>2.5</td>\n",
|
||
" <td>3067.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>256.0</td>\n",
|
||
" <td>107.0</td>\n",
|
||
" <td>1890.0</td>\n",
|
||
" <td>Yarra</td>\n",
|
||
" <td>-37.8060</td>\n",
|
||
" <td>144.9954</td>\n",
|
||
" <td>Northern Metropolitan</td>\n",
|
||
" <td>4019.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 21 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Suburb Address Rooms Type Price Method SellerG \\\n",
|
||
"1 Abbotsford 25 Bloomburg St 2 h 1035000.0 S Biggin \n",
|
||
"2 Abbotsford 5 Charles St 3 h 1465000.0 SP Biggin \n",
|
||
"4 Abbotsford 55a Park St 4 h 1600000.0 VB Nelson \n",
|
||
"6 Abbotsford 124 Yarra St 3 h 1876000.0 S Nelson \n",
|
||
"7 Abbotsford 98 Charles St 2 h 1636000.0 S Nelson \n",
|
||
"\n",
|
||
" Date Distance Postcode ... Bathroom Car Landsize BuildingArea \\\n",
|
||
"1 4/02/2016 2.5 3067.0 ... 1.0 0.0 156.0 79.0 \n",
|
||
"2 4/03/2017 2.5 3067.0 ... 2.0 0.0 134.0 150.0 \n",
|
||
"4 4/06/2016 2.5 3067.0 ... 1.0 2.0 120.0 142.0 \n",
|
||
"6 7/05/2016 2.5 3067.0 ... 2.0 0.0 245.0 210.0 \n",
|
||
"7 8/10/2016 2.5 3067.0 ... 1.0 2.0 256.0 107.0 \n",
|
||
"\n",
|
||
" YearBuilt CouncilArea Lattitude Longtitude Regionname \\\n",
|
||
"1 1900.0 Yarra -37.8079 144.9934 Northern Metropolitan \n",
|
||
"2 1900.0 Yarra -37.8093 144.9944 Northern Metropolitan \n",
|
||
"4 2014.0 Yarra -37.8072 144.9941 Northern Metropolitan \n",
|
||
"6 1910.0 Yarra -37.8024 144.9993 Northern Metropolitan \n",
|
||
"7 1890.0 Yarra -37.8060 144.9954 Northern Metropolitan \n",
|
||
"\n",
|
||
" Propertycount \n",
|
||
"1 4019.0 \n",
|
||
"2 4019.0 \n",
|
||
"4 4019.0 \n",
|
||
"6 4019.0 \n",
|
||
"7 4019.0 \n",
|
||
"\n",
|
||
"[5 rows x 21 columns]"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"melbourne_data.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "b4939e52",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#features = ['BuildingArea', 'Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude', 'YearBuilt', 'Distance']\n",
|
||
"features = ['Rooms', 'BuildingArea']\n",
|
||
"data = melbourne_data[features + ['Price']]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "47f35849",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Rooms</th>\n",
|
||
" <th>BuildingArea</th>\n",
|
||
" <th>Price</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>count</th>\n",
|
||
" <td>6196.000000</td>\n",
|
||
" <td>6196.000000</td>\n",
|
||
" <td>6.196000e+03</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>mean</th>\n",
|
||
" <td>2.931407</td>\n",
|
||
" <td>141.568645</td>\n",
|
||
" <td>1.068828e+06</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>std</th>\n",
|
||
" <td>0.971079</td>\n",
|
||
" <td>90.834824</td>\n",
|
||
" <td>6.751564e+05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>min</th>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>1.310000e+05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25%</th>\n",
|
||
" <td>2.000000</td>\n",
|
||
" <td>91.000000</td>\n",
|
||
" <td>6.200000e+05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>50%</th>\n",
|
||
" <td>3.000000</td>\n",
|
||
" <td>124.000000</td>\n",
|
||
" <td>8.800000e+05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>75%</th>\n",
|
||
" <td>4.000000</td>\n",
|
||
" <td>170.000000</td>\n",
|
||
" <td>1.325000e+06</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>max</th>\n",
|
||
" <td>8.000000</td>\n",
|
||
" <td>3112.000000</td>\n",
|
||
" <td>9.000000e+06</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Rooms BuildingArea Price\n",
|
||
"count 6196.000000 6196.000000 6.196000e+03\n",
|
||
"mean 2.931407 141.568645 1.068828e+06\n",
|
||
"std 0.971079 90.834824 6.751564e+05\n",
|
||
"min 1.000000 0.000000 1.310000e+05\n",
|
||
"25% 2.000000 91.000000 6.200000e+05\n",
|
||
"50% 3.000000 124.000000 8.800000e+05\n",
|
||
"75% 4.000000 170.000000 1.325000e+06\n",
|
||
"max 8.000000 3112.000000 9.000000e+06"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"data.describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "ed0fdea0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Rooms</th>\n",
|
||
" <th>BuildingArea</th>\n",
|
||
" <th>Price</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>79.0</td>\n",
|
||
" <td>1035000.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>150.0</td>\n",
|
||
" <td>1465000.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>142.0</td>\n",
|
||
" <td>1600000.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>210.0</td>\n",
|
||
" <td>1876000.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>107.0</td>\n",
|
||
" <td>1636000.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Rooms BuildingArea Price\n",
|
||
"1 2 79.0 1035000.0\n",
|
||
"2 3 150.0 1465000.0\n",
|
||
"4 4 142.0 1600000.0\n",
|
||
"6 3 210.0 1876000.0\n",
|
||
"7 2 107.0 1636000.0"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"data.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "b5126919",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Definition der Funktionen für die Lineare Regression"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "dacabf66",
|
||
"metadata": {},
|
||
"source": [
|
||
"Aus der Vorlesung:\n",
|
||
"\n",
|
||
"$$ h(x, w) = w^T x . $$\n",
|
||
"\n",
|
||
"In der Vorlesung haben wir $\\theta$ statt $w$ verwendet.\n",
|
||
"\n",
|
||
"**Wichtig:** Diese Definition von $h$ nimmt an, dass die erste Komponente von $x$, also im Python-Code `x[0]`, immer 1 ist.\n",
|
||
"\n",
|
||
"Wir können auch eine vektorisierte Form von $h(x, w)$ definieren, bei der der Input $X$ mehrere (oder alle) Trainingsbeispiele umfasst und der Output ein Vektor aus den zugehörigen Werten von h zu jedem der Trainingsbeispiele ist. In Matrixschreibweise:\n",
|
||
"\n",
|
||
"$$ h(X, w) = X w , $$\n",
|
||
"\n",
|
||
"wobei die Zeilen von $X$ aus je einem Trainingsbeispiel (inkl. der \"1\" in der ersten Komponente) bestehen.\n",
|
||
"\n",
|
||
"Aufgrund der Art, wie `numpy` den Spezialfall der Multiplikation zweier Vektoren handhabt, können wir den Code für beide oben erwähnten Varianten von $h$ vereinheitlichen und eine Funktion $h(x, w)$ definieren, die sowohl mit einer Inputzeile als auch mit mehreren Inputzeilen umgehen kann.\n",
|
||
"\n",
|
||
"Bei der Multiplikation zweier eindimensionaler numpy-Arrays (also zweier Vektoren) mittels `@`-Operator bildet numpy stets das Skalarprodukt der Vektoren, ohne dass man einen der Vektoren transponieren müsste. D.h., wenn wir zwei Spaltenvektoren $w, x$ haben, lautet die korrekte mathematische Schreibweise eigentlich:\n",
|
||
"$$w^T x$$\n",
|
||
"numpy erlaubt es uns aber einfach `w @ x` oder auch `x @ w` zu schreiben anstelle (des ebenfalls möglichen) `w.T @ x`.\n",
|
||
"\n",
|
||
"Dies ermöglicht es uns, eine vektorisierte Form von $h(x, w)$ leicht aufzuschreiben, die sowohl mit einem Parameter `x`, bestehend aus einer Zeile an Inputdaten (also z.B. einem einzelnen Trainingsbeispiel), funktioniert als auch mit der gesamten Feature-Matrix `X`, bestehend aus allen (oder mehreren) Trainingsdaten auf einmal."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "14116a52",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def h(x, w):\n",
|
||
" \"\"\"x und w sind numpy arrays; x kann auch die komplette Feature-Matrix sein\"\"\"\n",
|
||
" # Diese Form erlaubt es für x eine ganze (Feature-)Matrix zu übergeben. Die Matrix enthält\n",
|
||
" # zeilenweise je einen Datenpunkt, für den h berechnet werden soll.\n",
|
||
" # x @ w ist dann ein Vektor mit je einem Ergebnis in den Komponenten des Vektors pro Zeile\n",
|
||
" # der übergebenen (Feature-)Matrix.\n",
|
||
" return x @ w"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "82129a25",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Definition der Kostenfunktion\n",
|
||
"def J(w, X, y):\n",
|
||
" \"\"\"\n",
|
||
" w, X, y müssen numpy arrays sein\n",
|
||
" X: Feature-Matrix aller Trainingsdaten inkl. Spalte mit 1; Dimension: n x (d+1)\n",
|
||
" y: Vektor aller Targets zu X\n",
|
||
" \"\"\"\n",
|
||
" errors = y - h(x=X, w=w)\n",
|
||
" mse = 1.0/(2.0*len(y)) * ( errors @ errors )\n",
|
||
" return mse"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "4209dc9c",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(6196, 3)"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"data.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "8b34a5c7",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([[1.],\n",
|
||
" [1.],\n",
|
||
" [1.],\n",
|
||
" ...,\n",
|
||
" [1.],\n",
|
||
" [1.],\n",
|
||
" [1.]])"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"np.ones((len(data),1))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "6632cabe",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def feature_matrix_from_data(data):\n",
|
||
" # hier erzeugen wir die Matrix mit unseren Input-Daten (Features) inklusive der Spalte mit \"1\"\n",
|
||
" return np.hstack((np.ones((len(data),1)), data.to_numpy(copy=True)))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "74556a69",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# hier erzeugen wir die Matrix mit unseren Input-Daten (Features) inklusive der Spalte mit \"1\"\n",
|
||
"#X = np.hstack((np.ones((len(data),1)), data[features].to_numpy(copy=True)))\n",
|
||
"X = feature_matrix_from_data(data[features])\n",
|
||
"# und ausserdem den Vektor der Targets\n",
|
||
"y = data.Price.to_numpy(copy=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "79f5e3e0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(6196, 3)"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "8f9724c3",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([[ 1. , 2. , 79. ],\n",
|
||
" [ 1. , 3. , 150. ],\n",
|
||
" [ 1. , 4. , 142. ],\n",
|
||
" ...,\n",
|
||
" [ 1. , 1. , 35.64],\n",
|
||
" [ 1. , 2. , 61.6 ],\n",
|
||
" [ 1. , 6. , 388.5 ]])"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "85733d6b",
|
||
"metadata": {},
|
||
"source": [
|
||
"**Hinweis:** Die Matrix $X$ hat zwar die gleiche Dimension wie `data`, allerdings enthält `data` eine Spalte `Price`, die in $X$ nicht enthalten ist. Dafür hat $X$ als erste Spalte die \"1er\"."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "1d8e64e6",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Analytische Lösung der linearen Regression\n",
|
||
"\n",
|
||
"Die analytische Lösung verläuft identisch zum Fall mit nur einem Feature.\n",
|
||
"\n",
|
||
"`np.linalg.solve(A, b)` berechnet $w$ im linearen Gleichungssystem\n",
|
||
"\n",
|
||
"$ A w = b $\n",
|
||
"\n",
|
||
"$A$ - Matrix,\n",
|
||
"$w$ - Vektor (unsere Unbekannten),\n",
|
||
"$b$ - Vektor.\n",
|
||
"\n",
|
||
"Wir suchen die Lösung $w$ im folgenden Gleichungssystem:\n",
|
||
"\n",
|
||
"$$ X^T X w = X^T Y $$\n",
|
||
"\n",
|
||
"Mit $A = X^TX$ und $b = X^T Y$ berechnet `np.linalg.solve(A, b)` unsere gesuchten Parameter für die lineare Regression."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "fc1d2c0a",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Die 3 Parameter der linearen Regression:\n",
|
||
"[ 42769.88494072 232612.86504788 2431.15453776]\n",
|
||
"Kostenfunktion J(w_ana): 147658829426.14856\n",
|
||
"CPU times: user 11.3 ms, sys: 2.13 ms, total: 13.4 ms\n",
|
||
"Wall time: 1.7 ms\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"%%time\n",
|
||
"w_ana = np.linalg.solve(X.T @ X, X.T @ y)\n",
|
||
"print('Die {} Parameter der linearen Regression:\\n{}'.format(len(w_ana), w_ana))\n",
|
||
"J_ana = J(w=w_ana, X=X, y=y)\n",
|
||
"print('Kostenfunktion J(w_ana): {}'.format(J_ana))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "daab4572",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Numerische Lösung mit Gradient Descent"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "b314f36a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"## Numerische Lösung mit Gradient Descent\n",
|
||
"def grad_desc_upd(w, alpha, x, y):\n",
|
||
" \"\"\"w, y sind Vektoren, x ist die Feature-Matrix (numpy-arrays)\"\"\"\n",
|
||
" errors = y - h(x=x, w=w)\n",
|
||
" w = w + alpha / len(y) * (x.T @ errors)\n",
|
||
" return w"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "3dc2775c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def grad_desc(w, alpha, x, y, n_iterations):\n",
|
||
" J_all = [[0], [J(w=w, X=x, y=y)]]\n",
|
||
" for it in tqdm(range(n_iterations)):\n",
|
||
" w = grad_desc_upd(w=w, alpha=alpha, x=x, y=y)\n",
|
||
" if it % 100 == 0:\n",
|
||
" J_all[1].append(J(w=w, X=x, y=y))\n",
|
||
" J_all[0].append(it)\n",
|
||
" return w, J_all"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "a801cac3",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([ 2.43686014, 5.00088371, 206.19316114])"
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"grad_desc_upd(w=np.ones(X.shape[1]), alpha=1e-6, x=X[:7], y=y[:7])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "dc6f778a",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "e9c9403f9b08472294edb52bb2c10c1d",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/10000 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"CPU times: user 1.94 s, sys: 1.87 s, total: 3.81 s\n",
|
||
"Wall time: 417 ms\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"%%time\n",
|
||
"w_init = np.ones(X.shape[1])\n",
|
||
"alpha = 3.1e-10 # verschiedene alpha ausprobieren\n",
|
||
"n_iterations = 10000\n",
|
||
"_, J_tmp = grad_desc(w=w_init, alpha=alpha, x=X, y=y, n_iterations=n_iterations)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"id": "c04ebb9f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "57bac370a44f48e9951dc5dba56b29ef",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/100000 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Die 3 Parameter der linearen Regression:\n",
|
||
"[ 10.49970832 31.89447067 1601.95876825]\n",
|
||
"Kostenfunktion J: 540959857400.77966\n",
|
||
"J relativ zu Startkosten: 0.6771395257663181\n",
|
||
"Vergleich Kostenfunktion zu analytischer Lösung: 3.66*J_ana\n",
|
||
"Relative Abweichung der Parameter zu analytischer Lösung: [2.45493022e-04 1.37113958e-04 6.58929222e-01]*w_ana\n",
|
||
"CPU times: user 21.5 s, sys: 11 s, total: 32.6 s\n",
|
||
"Wall time: 3.53 s\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"%%time\n",
|
||
"w_init = np.ones(X.shape[1])\n",
|
||
"alpha = 1e-10 # verschiedene alpha ausprobieren\n",
|
||
"n_iterations = 100000\n",
|
||
"w_gd, J_all = grad_desc(w=w_init, alpha=alpha, x=X, y=y, n_iterations=n_iterations)\n",
|
||
"print('Die {} Parameter der linearen Regression:\\n{}'.format(len(w_gd), w_gd))\n",
|
||
"print('Kostenfunktion J: {}'.format(J_all[1][-1]))\n",
|
||
"print('J relativ zu Startkosten: {}'.format(J_all[1][-1]/J_all[1][0]))\n",
|
||
"print('Vergleich Kostenfunktion zu analytischer Lösung: {:.2f}*J_ana'.format(J_all[1][-1]/J_ana))\n",
|
||
"print('Relative Abweichung der Parameter zu analytischer Lösung: {}*w_ana'.format((w_gd)/w_ana))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"id": "8b4db3ee",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Axes: >"
|
||
]
|
||
},
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"sns.lineplot(x=J_all[0], y=J_all[1])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "31574b9a",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "16a65185cb4541ffbdb5a02b33027697",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/1000000 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Die 3 Parameter der linearen Regression:\n",
|
||
"[ 81.08915614 248.45370379 6493.32860783]\n",
|
||
"Kostenfunktion J: 201611248738.63248\n",
|
||
"J relativ zu Startkosten: 0.37282268754553793\n",
|
||
"Vergleich Kostenfunktion zu analytischer Lösung: 1.36539*J_ana\n",
|
||
"Relative Abweichung der Parameter zu analytischer Lösung: [1.89594048e-03 1.06809958e-03 2.67088270e+00]*w_ana\n",
|
||
"CPU times: user 3min 26s, sys: 1min 26s, total: 4min 53s\n",
|
||
"Wall time: 31.6 s\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"%%time\n",
|
||
"alpha = 3.1e-10 # verschiedene alpha ausprobieren\n",
|
||
"n_iterations = 1000000\n",
|
||
"w_gd2, J_all2 = grad_desc(w=w_gd, alpha=alpha, x=X, y=y, n_iterations=n_iterations)\n",
|
||
"print('Die {} Parameter der linearen Regression:\\n{}'.format(len(w_gd2), w_gd2))\n",
|
||
"print('Kostenfunktion J: {}'.format(J_all2[1][-1]))\n",
|
||
"print('J relativ zu Startkosten: {}'.format(J_all2[1][-1]/J_all2[1][0]))\n",
|
||
"print('Vergleich Kostenfunktion zu analytischer Lösung: {:.5f}*J_ana'.format(J_all2[1][-1]/J_ana))\n",
|
||
"print('Relative Abweichung der Parameter zu analytischer Lösung: {}*w_ana'.format((w_gd2)/w_ana))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"id": "4434e050",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Axes: >"
|
||
]
|
||
},
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"sns.lineplot(x=J_all2[0], y=J_all2[1])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "4d0fbfee",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "d680d4cc18984adab5920af97da76e2e",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/10000000 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Die 3 Parameter der linearen Regression:\n",
|
||
"[ 540.5020477 1598.80804052 6469.41806027]\n",
|
||
"Kostenfunktion J: 200954758401.09796\n",
|
||
"J relativ zu Startkosten: 0.9967438136028319\n",
|
||
"Vergleich Kostenfunktion zu analytischer Lösung: 1.36*J_ana\n",
|
||
"Relative Abweichung der Parameter zu analytischer Lösung: [0.01263744 0.00687326 2.66104765]*w_ana\n",
|
||
"CPU times: user 37min 33s, sys: 9min 27s, total: 47min 1s\n",
|
||
"Wall time: 5min 1s\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"%%time\n",
|
||
"alpha = 3.1e-10 # verschiedene alpha ausprobieren\n",
|
||
"n_iterations = 10000000\n",
|
||
"w_gd3, J_all3 = grad_desc(w=w_gd2, alpha=alpha, x=X, y=y, n_iterations=n_iterations)\n",
|
||
"\n",
|
||
"print('Die {} Parameter der linearen Regression:\\n{}'.format(len(w_gd3), w_gd3))\n",
|
||
"print('Kostenfunktion J: {}'.format(J_all3[1][-1]))\n",
|
||
"print('J relativ zu Startkosten: {}'.format(J_all3[1][-1]/J_all3[1][0]))\n",
|
||
"print('Vergleich Kostenfunktion zu analytischer Lösung: {:.2f}*J_ana'.format(J_all3[1][-1]/J_ana))\n",
|
||
"print('Relative Abweichung der Parameter zu analytischer Lösung: {}*w_ana'.format((w_gd3)/w_ana))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"id": "252656f1",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Axes: >"
|
||
]
|
||
},
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"sns.lineplot(x=J_all3[0], y=J_all3[1])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "e8b1f648",
|
||
"metadata": {},
|
||
"source": [
|
||
"## $R^2$"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "50022cc2",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"erklärte Varianz (R^2): 0.3520362618371272\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"X = feature_matrix_from_data(data[features])\n",
|
||
"y = data.Price.to_numpy(copy=True)\n",
|
||
"J_ana = J(w=w_ana, X=X, y=y)\n",
|
||
"MSE = 2*J_ana\n",
|
||
"mu_y = sum(y)/len(y)\n",
|
||
"sigma_y_quadrat = ( (y - mu_y) @ (y - mu_y) ) / len(y)\n",
|
||
"R2 = 1 - MSE/sigma_y_quadrat\n",
|
||
"print('erklärte Varianz (R^2): {}'.format(R2))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "104ad3d4",
|
||
"metadata": {},
|
||
"source": [
|
||
"$R^2$ ist größer als beim Modell mit nur 1 Feature (BuildingArea)."
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.4"
|
||
},
|
||
"varInspector": {
|
||
"cols": {
|
||
"lenName": 16,
|
||
"lenType": 16,
|
||
"lenVar": 40
|
||
},
|
||
"kernels_config": {
|
||
"python": {
|
||
"delete_cmd_postfix": "",
|
||
"delete_cmd_prefix": "del ",
|
||
"library": "var_list.py",
|
||
"varRefreshCmd": "print(var_dic_list())"
|
||
},
|
||
"r": {
|
||
"delete_cmd_postfix": ") ",
|
||
"delete_cmd_prefix": "rm(",
|
||
"library": "var_list.r",
|
||
"varRefreshCmd": "cat(var_dic_list()) "
|
||
}
|
||
},
|
||
"types_to_exclude": [
|
||
"module",
|
||
"function",
|
||
"builtin_function_or_method",
|
||
"instance",
|
||
"_Feature"
|
||
],
|
||
"window_display": false
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|