# %% Load the Melbourne housing data set
melbourne_file_path = 'data/melb_data.csv'
# Remove every row that contains at least one missing value.
melbourne_data = pd.read_csv(melbourne_file_path).dropna(axis=0)
melbourne_data.columns  # column names of the table (candidate features)

# %% Select the model features and the target
# Larger candidate feature set, kept for reference:
# features = ['BuildingArea', 'Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude', 'YearBuilt', 'Distance']
features = ['Rooms', 'BuildingArea', 'Lattitude']

# A price *per* area is a quotient. The previous version multiplied Price by
# Landsize, which produced values around 1e8 that are not a unit price.
# A land size of 0 is mapped to NaN so the quotient is NaN instead of inf.
# NOTE(review): Landsize is kept as the denominator because the original used
# it; switch to 'BuildingArea' if price per built area is intended -- confirm.
data = melbourne_data[features + ['Price']].assign(
    price_per_area=melbourne_data['Price'] / melbourne_data['Landsize'].replace(0, np.nan)
)
data.head()
# %% Linear hypothesis
def h(x, w):
    """Evaluate the linear hypothesis ``x @ w``.

    Parameters
    ----------
    x : array-like
        Either a single data point (1-D) or a whole feature matrix (2-D) that
        holds one data point per row.
    w : array-like
        Parameter vector; its length must match the last axis of ``x``.

    Returns
    -------
    The matrix product ``x @ w``: a scalar for a 1-D ``x``, otherwise a
    vector with one prediction per row of ``x``.

    Raises
    ------
    ValueError
        Raised by NumPy when the shapes of ``x`` and ``w`` are incompatible.
    """
    # Coerce first so that plain Python lists work as well as ndarrays.
    x = np.asarray(x)
    w = np.asarray(w)
    # Row-wise evaluation: entry i of the result is the dot product of row i
    # of x with w.
    return x @ w


# %% Cost function
def J(w, X, y):
    """Half mean squared error of the linear model ``h`` on ``(X, y)``.

    Computes ``1 / (2 * n) * sum((y - h(X, w)) ** 2)`` with ``n = len(y)``.

    Parameters
    ----------
    w : array-like
        Parameter vector.
    X : array-like
        Feature matrix of all training data including the column of ones;
        dimension n x (d+1).
    y : array-like, shape (n,)
        Vector of all targets belonging to the rows of ``X``.

    Returns
    -------
    The cost as a NumPy floating-point scalar.

    Raises
    ------
    ValueError
        If ``y`` is not one-dimensional, is empty, or does not have the same
        shape as the predictions ``h(X, w)``.
    """
    y = np.asarray(y)
    if y.ndim != 1:
        # A column vector of shape (n, 1) would broadcast against the (n,)
        # predictions to an (n, n) matrix and silently give a wrong result.
        raise ValueError(
            'y must be one-dimensional, got shape {}'.format(y.shape))
    n = y.shape[0]
    if n == 0:
        raise ValueError('J is undefined for an empty target vector')

    predictions = h(x=X, w=w)
    if np.shape(predictions) != y.shape:
        raise ValueError(
            'predictions have shape {} but y has shape {}'.format(
                np.shape(predictions), y.shape))

    errors = y - predictions
    half_mse = 1.0 / (2.0 * n) * (errors @ errors)
    return half_mse


# %% Feature matrix
def feature_matrix_from_data(data):
    """Build the feature matrix with a leading column of ones.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series or array-like
        The input features, one data point per row. One-dimensional input
        (a Series or 1-D array) is treated as a single feature column.

    Returns
    -------
    numpy.ndarray
        A new array of shape (n, d+1): a column of ones followed by the
        feature values.

    Raises
    ------
    ValueError
        If ``data`` has more than two dimensions.
    """
    values = data.to_numpy() if hasattr(data, 'to_numpy') else np.asarray(data)
    if values.ndim == 1:
        values = values.reshape(-1, 1)
    if values.ndim != 2:
        raise ValueError(
            'data must be one- or two-dimensional, got shape {}'.format(values.shape))
    # hstack always allocates a fresh array, so no explicit copy is required.
    return np.hstack((np.ones((values.shape[0], 1)), values))
# %% Assemble the feature matrix and the target vector
# X: the selected feature columns with the column of ones prepended.
X = feature_matrix_from_data(data[features])
# y: the vector of targets (the Price column).
y = data.Price.to_numpy(copy=True)

# %% Closed-form fit via the normal equations
# %%time
# (cell magic -- has to be the first line of the notebook cell)
# Solve (X^T X) w = X^T y for w.
# NOTE(review): forming X^T X may be poorly conditioned for these features;
# np.linalg.lstsq(X, y, rcond=None) would avoid it -- confirm whether the
# explicit normal equations are wanted here.
normal_matrix = X.T @ X
normal_rhs = X.T @ y
w_ana = np.linalg.solve(normal_matrix, normal_rhs)
print('Die {} Parameter der linearen Regression:\n{}'.format(len(w_ana), w_ana))
# Cost of the fitted parameters on the training data.
J_ana = J(w=w_ana, X=X, y=y)
print('Kostenfunktion J(w_ana): {}'.format(J_ana))