{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dff037b7-7d71-49c2-8a47-48017c073f81",
   "metadata": {},
   "source": [
    "$R^2$ für ein polynomielles Modell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "8b1ff6ff-f80e-4cc3-b266-0ad417911d1d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "# plotting settings\n",
    "pd.plotting.register_matplotlib_converters()\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "from tqdm.notebook import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "f698283d-7346-4618-9b87-60a3de061a98",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',\n",
       "       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',\n",
       "       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',\n",
       "       'Longtitude', 'Regionname', 'Propertycount'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "melbourne_file_path = 'data/melb_data.csv'\n",
    "melbourne_data = pd.read_csv(melbourne_file_path)\n",
    "melbourne_data = melbourne_data.dropna(axis=0)  # entfernen von Daten mit fehlenden Werten\n",
    "melbourne_data.columns  # Spaltennamen der Tabelle (potentielle Features)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "dac65c52-f2ce-47b6-ba65-3c6bd915dfe8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Rooms</th>\n",
       "      <th>BuildingArea</th>\n",
       "      <th>Lattitude</th>\n",
       "      <th>Price</th>\n",
       "      <th>price_per_area</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>79.0</td>\n",
       "      <td>-37.8079</td>\n",
       "      <td>1035000.0</td>\n",
       "      <td>161460000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>150.0</td>\n",
       "      <td>-37.8093</td>\n",
       "      <td>1465000.0</td>\n",
       "      <td>196310000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>142.0</td>\n",
       "      <td>-37.8072</td>\n",
       "      <td>1600000.0</td>\n",
       "      <td>192000000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3</td>\n",
       "      <td>210.0</td>\n",
       "      <td>-37.8024</td>\n",
       "      <td>1876000.0</td>\n",
       "      <td>459620000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2</td>\n",
       "      <td>107.0</td>\n",
       "      <td>-37.8060</td>\n",
       "      <td>1636000.0</td>\n",
       "      <td>418816000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Rooms  BuildingArea  Lattitude      Price  price_per_area\n",
       "1      2          79.0   -37.8079  1035000.0     161460000.0\n",
       "2      3         150.0   -37.8093  1465000.0     196310000.0\n",
       "4      4         142.0   -37.8072  1600000.0     192000000.0\n",
       "6      3         210.0   -37.8024  1876000.0     459620000.0\n",
       "7      2         107.0   -37.8060  1636000.0     418816000.0"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#features = ['BuildingArea', Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude', 'YearBuilt', 'Distance']\n",
    "features = ['Rooms', 'BuildingArea', 'Lattitude']\n",
    "data = melbourne_data[features + ['Price']]\n",
    "data = data.assign(price_per_area = melbourne_data['Price'] * melbourne_data[\"Landsize\"])\n",
    "data.describe()\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "afe5a08a-abec-4164-85c4-1d3ac8398a62",
   "metadata": {},
   "outputs": [],
   "source": [
    "def h(x, w):\n",
    "    \"\"\"x und w sind numpy arrays; x kann auch die komplette Feature-Matrix sein\"\"\"\n",
    "    # Diese Form erlaubt es für x eine ganze (Feature-)Matrix zu übergeben. Die Matrix enthält\n",
    "    # zeilenweise je einen Datenpunkt, für den h berechnet werden soll.\n",
    "    # w @ x.T ist dann ein Vektor mit je einem Ergebnis in den Komponenten des Vektors pro Zeile\n",
    "    # der übergebenen (Feature-)Matrix.\n",
    "    return x @ w\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "c2e256a3-3575-45c4-a99c-d41c3c56e1c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Definition der Kostenfunktion\n",
    "def J(w, X, y):\n",
    "    \"\"\"\n",
    "    w, X, y müssen numpy arrays sein\n",
    "    X: Feature-Matrix aller Trainingsdaten inkl. Spalte mit 1; Dimension: n x (d+1)\n",
    "    y: Vektor aller Targets zu X\n",
    "    \"\"\"\n",
    "    errors = y - h(x=X, w=w)\n",
    "    mse = 1.0/(2.0*len(y)) * ( errors @ errors )\n",
    "    return mse\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "b41b9c03-0c1d-4a6b-80e8-d7e8775b69c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def feature_matrix_from_data(data):\n",
    "    # hier erzeugen wir die Matrix mit unseren Input-Daten (Features) inklusive der Spalte mit \"1\"\n",
    "    return np.hstack((np.ones((len(data),1)), data.to_numpy(copy=True)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "2c631e17-eb36-43d0-97b1-59add1c93dd9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# hier erzeugen wir die Matrix mit unseren Input-Daten (Features) inklusive der Spalte mit \"1\"\n",
    "#X = np.hstack((np.ones((len(data),1)), data[features].to_numpy(copy=True)))\n",
    "X = feature_matrix_from_data(data[features])\n",
    "# und ausserdem den Vektor der Targets\n",
    "y = data.Price.to_numpy(copy=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "8301eb74-9aae-446c-ad46-924811b99777",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Die 4 Parameter der linearen Regression:\n",
      "[-6.97461781e+07  2.41559504e+05  2.31456611e+03 -1.84562537e+06]\n",
      "Kostenfunktion J(w_ana): 137899453867.5851\n",
      "CPU times: user 475 μs, sys: 38 μs, total: 513 μs\n",
      "Wall time: 500 μs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "w_ana = np.linalg.solve(X.T @ X, X.T @ y)\n",
    "print('Die {} Parameter der linearen Regression:\\n{}'.format(len(w_ana), w_ana))\n",
    "J_ana = J(w=w_ana, X=X, y=y)\n",
    "print('Kostenfunktion J(w_ana): {}'.format(J_ana))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (ki)",
   "language": "python",
   "name": "myenv"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}