ki-dhbw/tasks/01-melbourne.py

#!/usr/bin/env python
# %% Imports
from telnetlib import BM
# imports überall im Code möglich, aber die Konvention ist alle benötigten import statements
# gleich zu Beginn einer Datei zu machen
# numpy ist ein Python-Modul für Numerik, das sowohl Funktionalität als auch Effizienz bietet
import numpy as np
# pandas ist sehr gut zum Arbeiten mit tabellarischen Daten, egal ob csv, xls oder xlsx
from numpy.typing import NDArray as array
from numpy import float64 as float
import pandas as pd
from pandas.core.dtypes.dtypes import time
# plotting settings
pd.plotting.register_matplotlib_converters()
# matplotlib ist ein sehr umfangreiches Modul zum Erstellen von Visualisierungen/Plots
import matplotlib.pyplot as plt
%matplotlib inline
# seaborn erleichtert das Erstellen von oft verwendeten Plot-Typen;
# es basiert selbst auf matplotlib und man kann beides kombinieren
# eine schöne Einführung in Seaborn: https://www.kaggle.com/learn/data-visualization
import seaborn as sns

# %% load data
# Read the housing CSV and discard every row that has a missing value.
data = pd.read_csv("../data/melb_data.csv").dropna()
# Only the two columns used by the regression are kept.
data = data[["BuildingArea", "Price"]]
# Restrict to buildings with an area below 400 ...
data = data[data["BuildingArea"] < 400]
# ... and use at most the first 100 of the remaining rows.
data = data[:100]
ax = sns.scatterplot(x=data["BuildingArea"], y=data["Price"])
data.head()

# %% build the design matrix X and the target vector Y
# Every row of X is [1, BuildingArea]: the constant 1 is the factor of the
# intercept weight, so the model can be written as the matrix product X @ w.
# The columns are stacked in one vectorised call instead of looping over the
# DataFrame row by row; this also yields a correctly shaped (0, 2) matrix when
# the frame is empty.
X = np.column_stack((
    np.ones(len(data)),
    data["BuildingArea"].to_numpy(dtype=np.float64),
))
Y = data["Price"].to_numpy(dtype=np.float64, copy=True)

# %% solve the normal equations (X^T X) w = X^T Y for the weights
w_ana = np.linalg.solve(X.T @ X , X.T @ Y)
print(f"w_ana: {w_ana}")

# %% define that h function, this is just f(x) = mx + b

def h(weights: array[float], x):
    """Evaluate the linear hypothesis ``weights[0] + weights[1] * x``.

    ``weights[0]`` acts as the intercept and ``weights[1]`` as the slope.
    ``x`` may be a single number or a NumPy array; for an array the
    arithmetic is applied element-wise, so the kind of value returned
    follows the kind of ``x`` passed in.
    """
    intercept = weights[0]
    slope = weights[1]
    return intercept + slope * x

# %% plot the h function combined with the calculated weights
ax = sns.scatterplot(x=data["BuildingArea"], y=data["Price"])

# A straight line is fully described by two points, so h is only evaluated
# at the smallest and the largest building area.
xplot = [min(data["BuildingArea"]), max(data["BuildingArea"])]
yplot = [h(w_ana, x_end) for x_end in xplot]
sns.lineplot(x=xplot, y=yplot, ax=ax)
# %% Bewertungsfunktion
def j(weights: array[float], x: array[float], y: array[float]) -> float:
    """Return the least-squares cost of the linear model ``h`` on a data set.

    The cost is ``J(w) = 1 / (2 * m) * sum((y - h(w, x)) ** 2)`` where ``m``
    is the number of samples.

    Args:
        weights: Model weights ``[w0, w1]`` as expected by ``h``.
        x: 1-D NumPy array with the input values.
        y: 1-D NumPy array with the target values, same length as ``x``.

    Returns:
        The scalar cost; a smaller value means the line fits the data better.
    """
    # x and y have to be arrays (not plain floats) because the residuals are
    # reduced with ``len`` and the dot product ``@`` below.
    # The prediction must be computed from the inputs x, not from y.
    errw = y - h(weights, x)
    # errw @ errw is the sum of squared residuals; it is divided by 2 * m
    # (it must not end up in the denominator).
    return (errw @ errw) / (2.0 * len(errw))
# example usage: cost of the analytic weights on two hand-picked points
j(w_ana, np.array([1.1, 1.3]), np.array([2.4, 2.6]))
# %% calculate score of analytic approach
# pyright reports to_numpy as unknown here, but the method exists at runtime.
# copy=True yields arrays that are independent of the DataFrame.
x, y = (data[column].to_numpy(copy=True) for column in ("BuildingArea", "Price"))
j_ana = j(w_ana, x=x, y=y)
print('Kosten der analytischen Lösung: {}'.format(j_ana))
# %% define grad_dsc functions
# Gradient Descent
# Starting from an initial guess, the weights are repeatedly moved a small
# step (scaled by the learning rate alpha) against the gradient of the cost.
def __gradsc_iter(weights: array[float], alpha: float, x: array[float], y: array[float]) -> array[float]:
    """Perform one batch gradient-descent step for the linear model ``h``.

    With the residuals ``e = y - h(weights, x)`` and ``m = len(x)`` the
    update is::

        w0 <- w0 + alpha / m * sum(e)
        w1 <- w1 + alpha / m * sum(e * x)

    Args:
        weights: Current weights ``[w0, w1]``.
        alpha: Learning rate (step size).
        x: 1-D NumPy array with the input values.
        y: 1-D NumPy array with the target values, same length as ``x``.

    Returns:
        A new array with the updated ``[w0, w1]``; ``weights`` is not modified.
    """
    errw: array[float] = y - h(x=x, weights=weights)
    # The factor alpha / m is shared by both components, compute it once.
    step = alpha / len(x)
    return np.array([
        # np.sum reduces the array in native code instead of iterating over
        # it element by element like the builtin sum does.
        weights[0] + step * np.sum(errw),
        weights[1] + step * (errw @ x),
    ])

def grad_dsc(weights: array[float], alpha: float, x, y, n: int) -> tuple[array[float], array[float]]:
    """Run ``n`` gradient-descent steps and record the cost along the way.

    Args:
        weights: Initial weights ``[w0, w1]``; the caller's array is not
            modified because every step produces a new array.
        alpha: Learning rate passed to every step.
        x: 1-D NumPy array with the input values.
        y: 1-D NumPy array with the target values, same length as ``x``.
        n: Number of steps to perform.

    Returns:
        A tuple ``(weights, costs)``: the weights after the last step and an
        array of ``n + 1`` cost values, starting with the cost of the initial
        weights followed by the cost after each step.
    """
    j_all = [j(weights, x, y)]
    for _ in range(n):
        # Carry the result of each step into the next iteration; without this
        # rebinding the loop would keep evaluating the initial weights.
        weights = __gradsc_iter(weights, alpha, x, y)
        j_all.append(j(weights, x, y))
    return weights, np.array(j_all)
# %% run a single gradient-descent step with a tiny learning rate
w_tmp, j_tmp = grad_dsc(
    np.array([1e5, 1000.0]),
    alpha=1e-9,
    x=data["BuildingArea"].to_numpy(),
    y=data["Price"].to_numpy(),
    n=1,
)
# Cost after the step divided by the cost before the step.
j_tmp[1] / j_tmp[0]
# %% do the actual gradient descent
w_init = np.array([1e6, 1000.0])
x, y = np.array(data["BuildingArea"]), np.array(data["Price"])
w_gd_1e4, J_all_1e4 = grad_dsc(
    weights=w_init, alpha=np.float64(0.0001), x=x, y=y, n=100000
)

# Weights after the last gradient-descent step.
print('w_gd_1e4: {}'.format(w_gd_1e4))
# Final cost divided by the cost of the initial weights.
print('Vergleich zu Startkosten: {}'.format(J_all_1e4[-1]/J_all_1e4[0]))
# Final cost divided by the cost of the analytic weights; a value of 1 means
# both weight vectors have the same cost.
print('Vergleich zu analytischer Lösung: {}'.format(J_all_1e4[-1]/j_ana))
# Relative deviation of each gradient-descent weight from its analytic
# counterpart; 0 would mean the two weights are identical.
print('(w0_gd - w0_ana)/w0_ana: {}'.format((w_gd_1e4[0]-w_ana[0])/w_ana[0]))
print('(w1_gd - w1_ana)/w1_ana: {}'.format((w_gd_1e4[1]-w_ana[1])/w_ana[1]))