ki-dhbw/tasks/Untitled.ipynb

268 lines
8 KiB
Text
Raw Normal View History

2025-01-23 10:44:22 +01:00
{
"cells": [
{
"cell_type": "markdown",
"id": "dff037b7-7d71-49c2-8a47-48017c073f81",
"metadata": {},
"source": [
"$R^2$ für ein polynomielles Modell"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "8b1ff6ff-f80e-4cc3-b266-0ad417911d1d",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"# plotting settings\n",
"pd.plotting.register_matplotlib_converters()\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import seaborn as sns\n",
"from tqdm.notebook import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "f698283d-7346-4618-9b87-60a3de061a98",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',\n",
" 'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',\n",
" 'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',\n",
" 'Longtitude', 'Regionname', 'Propertycount'],\n",
" dtype='object')"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the Melbourne housing table and drop every row that has a missing value.\n",
"melbourne_file_path = 'data/melb_data.csv'\n",
"melbourne_data = pd.read_csv(melbourne_file_path).dropna(axis=0)\n",
"# Show the column names of the table (candidate features).\n",
"melbourne_data.columns\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dac65c52-f2ce-47b6-ba65-3c6bd915dfe8",
"metadata": {},
"outputs": [],
"source": [
"# Alternative, larger feature set:\n",
"#features = ['BuildingArea', 'Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude', 'YearBuilt', 'Distance']\n",
"features = ['Rooms', 'BuildingArea', 'Lattitude']\n",
"# Keep the selected features plus the target and add the price per unit of land area.\n",
"# A price per area is a ratio (Price / Landsize), not a product.\n",
"# A land size of 0 is mapped to NaN so the ratio becomes NaN instead of inf.\n",
"data = melbourne_data[features + ['Price']].assign(\n",
"    price_per_area=melbourne_data['Price'] / melbourne_data['Landsize'].replace(0, np.nan)\n",
")\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "afe5a08a-abec-4164-85c4-1d3ac8398a62",
"metadata": {},
"outputs": [],
"source": [
"def h(x, w):\n",
"    \"\"\"Evaluate the linear hypothesis x @ w.\n",
"\n",
"    x and w are numpy arrays. x may be a single feature vector or a complete\n",
"    feature matrix that holds one data point per row. In the matrix case the\n",
"    result is a vector with one entry per row of x.\n",
"    \"\"\"\n",
"    predictions = x @ w\n",
"    return predictions\n"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "c2e256a3-3575-45c4-a99c-d41c3c56e1c3",
"metadata": {},
"outputs": [],
"source": [
"# Cost function\n",
"def J(w, X, y):\n",
"    \"\"\"Return the sum of squared residuals of h on (X, y), divided by 2 * len(y).\n",
"\n",
"    w, X and y must be numpy arrays.\n",
"    X: feature matrix of all training data including the column of ones; dimension n x (d+1)\n",
"    y: vector of all targets belonging to X\n",
"    \"\"\"\n",
"    residuals = y - h(x=X, w=w)\n",
"    squared_error_sum = residuals @ residuals\n",
"    scale = 1.0/(2.0*len(y))\n",
"    return scale * squared_error_sum\n"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "b41b9c03-0c1d-4a6b-80e8-d7e8775b69c0",
"metadata": {},
"outputs": [],
"source": [
"def feature_matrix_from_data(data):\n",
"    \"\"\"Build the input (feature) matrix from data, with a column of ones prepended.\"\"\"\n",
"    feature_values = data.to_numpy(copy=True)\n",
"    ones_column = np.ones((len(data), 1))\n",
"    return np.hstack((ones_column, feature_values))\n"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "2c631e17-eb36-43d0-97b1-59add1c93dd9",
"metadata": {},
"outputs": [],
"source": [
"# Feature matrix of the input data, including the column of ones ...\n",
"X = feature_matrix_from_data(data[features])\n",
"# ... and the vector of the corresponding targets.\n",
"y = data['Price'].to_numpy(copy=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "8301eb74-9aae-446c-ad46-924811b99777",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Die 4 Parameter der linearen Regression:\n",
"[-6.97461781e+07 2.41559504e+05 2.31456611e+03 -1.84562537e+06]\n",
"Kostenfunktion J(w_ana): 137899453867.5851\n",
"CPU times: user 475 μs, sys: 38 μs, total: 513 μs\n",
"Wall time: 500 μs\n"
]
}
],
"source": [
"%%time\n",
"# Solve the linear system (X^T X) w = X^T y for the parameter vector w.\n",
"w_ana = np.linalg.solve(X.T @ X, X.T @ y)\n",
"# Cost of the solution found above.\n",
"J_ana = J(w_ana, X, y)\n",
"print('Die {} Parameter der linearen Regression:\\n{}'.format(len(w_ana), w_ana))\n",
"print('Kostenfunktion J(w_ana): {}'.format(J_ana))\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (ki)",
"language": "python",
"name": "myenv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}