ki-dhbw/tasks/01-melbourne.py

#!/usr/bin/env python
# %% Imports
from telnetlib import BM
# imports überall im Code möglich, aber die Konvention ist alle benötigten import statements
# gleich zu Beginn einer Datei zu machen
# numpy ist ein Python-Modul für Numerik, das sowohl Funktionalität als auch Effizienz bietet
import numpy as np
# pandas ist sehr gut zum Arbeiten mit tabellarischen Daten, egal ob csv, xls oder xlsx
from numpy.typing import NDArray as array
from numpy import float64 as float
import pandas as pd
from pandas.core.dtypes.dtypes import time
# plotting settings
pd.plotting.register_matplotlib_converters()
# matplotlib ist ein sehr umfangreiches Modul zum Erstellen von Visualisierungen/Plots
import matplotlib.pyplot as plt
%matplotlib inline
# seaborn erleichtert das Erstellen von oft verwendeten Plot-Typen;
# es basiert selbst auf matplotlib und man kann beides kombinieren
# eine schöne Einführung in Seaborn: https://www.kaggle.com/learn/data-visualization
import seaborn as sns

# %% load data
# Read the Melbourne housing CSV, drop every row with a missing value in any
# column, keep only buildings with an area below 400, take the first 100 of the
# remaining rows and reduce the frame to the two columns used for the regression.
data = (
    pd.read_csv("../data/melb_data.csv")
    .dropna()
    .loc[lambda frame: frame["BuildingArea"] < 400]
    .iloc[:100]
    .loc[:, ["BuildingArea", "Price"]]
)
# Quick visual check of the raw relationship between area and price.
ax = sns.scatterplot(data=data, x="BuildingArea", y="Price")
data.head()

# %% build the design matrix X and the target vector Y
# Each row of X is [1, BuildingArea]: the constant 1 is the factor that belongs
# to the intercept w0, so intercept and slope can be solved for together in a
# single matrix equation.
# The arrays are built with vectorized NumPy calls instead of a Python-level
# iterrows() loop; values and dtype (float64) are the same.
building_area = data["BuildingArea"].to_numpy(dtype=np.float64)
X = np.column_stack((np.ones_like(building_area), building_area))
Y = data["Price"].to_numpy(dtype=np.float64)

# %% solve the normal equations (X^T X) w = X^T Y for the analytic weights
# w_ana[0] is the intercept and w_ana[1] the slope, matching the column order of X.
normal_matrix = X.T @ X
normal_rhs = X.T @ Y
w_ana = np.linalg.solve(normal_matrix, normal_rhs)
print(f"w_ana: {w_ana}")

# %% define that h function, this is just f(x) = mx + b

def h(weights: array[float], x):
    """
    Evaluate the linear hypothesis ``weights[0] + weights[1] * x``.

    ``weights[0]`` is the intercept and ``weights[1]`` the slope. ``x`` may be
    a single number or a numpy array; with an array the expression is applied
    element-wise, so the result has the same shape as ``x``.
    """
    intercept, slope = weights[0], weights[1]
    return slope * x + intercept

# %% plot the data together with the line h(w_ana, x)
area_column = data['BuildingArea']
ax = sns.scatterplot(x=area_column, y=data['Price'])

# A straight line is fully determined by two points, so evaluating h at the
# smallest and the largest area is enough to draw the fitted line.
xplot = [min(area_column), max(area_column)]
yplot = [h(w_ana, endpoint) for endpoint in xplot]
sns.lineplot(x=xplot, y=yplot, ax=ax)
# %% cost function (Bewertungsfunktion)
def j(weights: array[float], x: array[float] ,y: array[float]) -> float:
    """
    Return the cost of the linear model ``h`` on the samples ``(x, y)``.

    The cost is half the mean squared error:
    ``J(w) = 1 / (2 * m) * sum((y_i - h(w, x_i)) ** 2)`` with ``m`` the number
    of samples.

    weights: intercept at index 0 and slope at index 1, as used by ``h``.
    x: 1-D numpy array of feature values.
    y: 1-D numpy array of target values, same length as ``x``.

    ``x`` and ``y`` have to be arrays (not single floats) because the residual
    vector is passed to ``len`` and to the ``@`` dot product below.
    """
    # Residuals between the targets and the predictions made from the features x.
    residuals = y - h(weights, x=x)
    # residuals @ residuals is the sum of squared residuals; it is the
    # numerator of the cost, only the factor 2 * m belongs in the denominator.
    return (residuals @ residuals) / (2.0 * len(residuals))
# Example call of j with two hand-written sample points; as the last expression
# of the cell its value is shown as the cell output.
j(w_ana, x=np.array([1.1, 1.3]), y=np.array([2.4, 2.6]))
# %% calculate score of analytic approach
# Series.to_numpy(copy=True) yields plain numpy arrays that are independent of
# the DataFrame. (The type checker reports to_numpy as unknown, but the method
# exists at runtime.)
x, y = (data[column].to_numpy(copy=True) for column in ('BuildingArea', 'Price'))
j_ana = j(w_ana, x=x, y=y)
print('Kosten der analytischen Lösung: {}'.format(j_ana))
# %% define grad_dsc functions
# Gradient descent: starting from initial weights, repeatedly move the weights
# a small step (scaled by the learning rate alpha) in the direction that
# lowers the cost.
def __gradsc_iter(weights: array[float], alpha: float, x: array[float], y: array[float]) -> array[float]:
    """
    Perform one gradient-descent step and return the updated weights.

    With ``residuals = y - h(weights, x)`` and ``m = len(x)`` the update is
    ``w0 + alpha / m * sum(residuals)`` for the intercept and
    ``w1 + alpha / m * (residuals . x)`` for the slope.

    weights: current intercept (index 0) and slope (index 1).
    alpha: learning rate, i.e. the step size factor.
    x: 1-D numpy array of feature values.
    y: 1-D numpy array of target values, same length as ``x``.

    A new array is returned; ``weights`` itself is not modified.
    """
    # Subtracting an array from an array is element-wise, so this is one
    # residual per sample.
    residuals = y - h(x=x, weights=weights)
    # The factor alpha / m is shared by both components, so compute it once.
    step = alpha / len(x)
    # np.sum runs in native code; the builtin sum() would iterate the array
    # element by element in Python on every single descent step.
    return np.array([
        weights[0] + step * np.sum(residuals),
        weights[1] + step * (residuals @ x),
    ])

def grad_dsc(weights: array[float], alpha: float, x, y, n: int) -> tuple[array[float], array[float]]:
    """
    Run ``n`` gradient-descent steps and record the cost after every step.

    weights: initial intercept (index 0) and slope (index 1).
    alpha: learning rate passed on to every step.
    x: 1-D numpy array of feature values.
    y: 1-D numpy array of target values, same length as ``x``.
    n: number of steps to perform.

    Returns a tuple ``(final_weights, costs)``; ``costs`` has ``n + 1``
    entries: the cost of the initial weights followed by the cost after each
    step. The array passed in as ``weights`` is not modified.
    """
    cost_history = [j(weights, x, y)]
    for _ in range(n):
        # Carry the updated weights into the next iteration; without rebinding
        # `weights` every step would restart from the initial values.
        weights = __gradsc_iter(weights, alpha, x, y)
        cost_history.append(j(weights, x, y))
    return weights, np.array(cost_history)
# %% sanity check: one single gradient-descent step with a tiny learning rate
# With n=1 the returned cost array has two entries: the cost of the start
# weights and the cost after the one step.
w_tmp, j_tmp = grad_dsc(
    weights=np.array([1e5, 1000.0]),
    alpha=1e-9,
    x=data["BuildingArea"].to_numpy(),
    y=data["Price"].to_numpy(),
    n=1,
)
# Ratio of the cost after the step to the cost before it (shown as cell output).
j_tmp[1] / j_tmp[0]
# %% do the actual gradient descent
w_init = np.array([1e6, 1000.])
x, y = (np.array(data[column]) for column in ('BuildingArea', 'Price'))
# NOTE(review): whether alpha = 1e-4 is small enough for the descent to
# converge depends on the scale of the BuildingArea values - confirm on the data.
w_gd_1e4, J_all_1e4 = grad_dsc(
    weights=w_init,
    alpha=np.float64(1e-4),
    x=x,
    y=y,
    n=100_000,
)

# How to read the report:
# - final cost / start cost: below 1 means the descent lowered the cost
# - final cost / analytic cost: close to 1 means the descent reached the cost
#   of the analytic solution
# - relative weight deviations: close to 0 means both methods found the same
#   intercept (w0) and slope (w1)
print('w_gd_1e4: {}'.format(w_gd_1e4))
print('Vergleich zu Startkosten: {}'.format(J_all_1e4[-1]/J_all_1e4[0]))
print('Vergleich zu analytischer Lösung: {}'.format(J_all_1e4[-1]/j_ana))
print('(w0_gd - w0_ana)/w0_ana: {}'.format((w_gd_1e4[0]-w_ana[0])/w_ana[0]))
print('(w1_gd - w1_ana)/w1_ana: {}'.format((w_gd_1e4[1]-w_ana[1])/w_ana[1]))
01: i tried to do it all, but this is all crap 2025-01-20 15:05:31 +01:00			`#!/usr/bin/env python`
initial commit 2025-01-16 14:14:58 +01:00			`# %% Imports`
01: i tried to do it all, but this is all crap 2025-01-20 15:05:31 +01:00			`from telnetlib import BM`
initial commit 2025-01-16 14:14:58 +01:00			`# imports überall im Code möglich, aber die Konvention ist alle benötigten import statements`
			`# gleich zu Beginn einer Datei zu machen`
			`# numpy ist ein Python-Modul für Numerik, das sowohl Funktionalität als auch Effizienz bietet`
			`import numpy as np`
			`# pandas ist sehr gut zum Arbeiten mit tabellarischen Daten, egal ob csv, xls oder xlsx`
01: i tried to do it all, but this is all crap 2025-01-20 15:05:31 +01:00			`from numpy.typing import NDArray as array`
			`from numpy import float64 as float`
initial commit 2025-01-16 14:14:58 +01:00			`import pandas as pd`
01: i tried to do it all, but this is all crap 2025-01-20 15:05:31 +01:00			`from pandas.core.dtypes.dtypes import time`
initial commit 2025-01-16 14:14:58 +01:00			`# plotting settings`
			`pd.plotting.register_matplotlib_converters()`
			`# matplotlib ist ein sehr umfangreiches Modul zum Erstellen von Visualisierungen/Plots`
			`import matplotlib.pyplot as plt`
			`%matplotlib inline`
			`# seaborn erleichtert das Erstellen von oft verwendeten Plot-Typen;`
			`# es basiert selbst auf matplotlib und man kann beides kombinieren`
			`# eine schöne Einführung in Seaborn: https://www.kaggle.com/learn/data-visualization`
			`import seaborn as sns`

01: i tried to do it all, but this is all crap 2025-01-20 15:05:31 +01:00			`# %% load data`
initial commit 2025-01-16 14:14:58 +01:00			`data = pd.read_csv("../data/melb_data.csv").dropna()`
01: i tried to do it all, but this is all crap 2025-01-20 15:05:31 +01:00			`# filter data: Less than 400 area, and max 100 data points`
			`data = data[(data["BuildingArea"] < 400) ][:100][["BuildingArea", "Price"]]`
initial commit 2025-01-16 14:14:58 +01:00			`ax = sns.scatterplot(x=data['BuildingArea'], y=data['Price'])`
01: i tried to do it all, but this is all crap 2025-01-20 15:05:31 +01:00			`data.head()`
initial commit 2025-01-16 14:14:58 +01:00
01: i tried to do it all, but this is all crap 2025-01-20 15:05:31 +01:00			`# %% prepare data with useless math extra values because we need to add one as a factor to all of these`
01: get analytic values first 2025-01-16 14:21:40 +01:00			`X = []`
			`Y = []`
01: i tried to do it all, but this is all crap 2025-01-20 15:05:31 +01:00			`# aufbereitung, x braucht noch den konstanten eins faktor`
01: get analytic values first 2025-01-16 14:21:40 +01:00			`for _, row in data.iterrows():`
			`X.append([1]+ [row['BuildingArea']])`
			`Y.append(row['Price'])`
			`X = np.array(X)`
			`Y = np.array(Y)`
01: i tried to do it all, but this is all crap 2025-01-20 15:05:31 +01:00
			`# %% solve the linear thing`
01: get analytic values first 2025-01-16 14:21:40 +01:00			`w_ana = np.linalg.solve(X.T @ X , X.T @ Y)`
01: i tried to do it all, but this is all crap 2025-01-20 15:05:31 +01:00			`print(f"w_ana: {w_ana}")`

			`# %% define that h function, this is just f(x) = mx + b`

			`def h(weights: array[float], x):`
			`"""`
			`x can be a float or an array because numpy does it all the same`
			`the return type depends on the type of x`
			`"""`
			`return weights[0] + weights[1] * x`

			`# %% plot the h function combined with the calculated wieghts`


			`ax = sns.scatterplot(x=data['BuildingArea'], y=data['Price'])`

			`xplot = [min(data['BuildingArea']), max(data['BuildingArea'])]`
			`yplot = [h(w_ana, x) for x in xplot]`
			`sns.lineplot(x=xplot, y=yplot, ax=ax)`
			`# %% Bewertungsfunktion`
			`def j(weights: array[float], x: array[float] ,y: array[float]) -> float:`
			`# angeblich sollen x,y UNBEDINGT numpy arrays sein, idk warum, sehen für mich aus wie floats, nicht wie arrays`
			`errw = y- h(weights, x=y) # pyright hat eigentlich recht, aber irgendwie kann ich doch nen array reinschmeißen auch wenns nen float frisst`
			`return 1.0/(2.0 * len(errw) * (errw @ errw))`
			`# example usage`
			`j(w_ana, np.array([1.1,1.3]), np.array([2.4,2.6]))`
			`# %% calculate score of analytic approach`
			`# pyright sagt to_numpy ist unknown, ist es aber aus irgendeinem grund nicht, python doof`
			`x = data['BuildingArea'].to_numpy(copy=True)`
			`y = data['Price'].to_numpy(copy=True)`
			`j_ana = j(w_ana, x=x, y=y)`
			`print('Kosten der analytischen Lösung: {}'.format(j_ana))`
			`# %% define grad_dsc functions`
			`# Gradient Descent`
			`# Let's be honest, no idea what I'm really doing here...`
			`def __gradsc_iter(weights: array[float], alpha: float, x: array[float], y: array[float]) -> array[float]:`
			`errw: array[float] = y - h(x=x, weights=weights) # weis nicht warum aber das geht doch datentypen mäßig`
			`return np.array([`
			`weights[0] + alpha / len(x) * sum(errw),`
			`weights[1] + alpha / len(x) * errw @ x`
			`])`

			`def grad_dsc(weights: array[float], alpha: float, x, y, n: int) -> tuple[array[float], array[float]]:`
			`j_all = [j(weights,x,y)]`
			`for i in range(n):`
			`w = __gradsc_iter(weights, alpha, x, y)`
			`j_all.append(j(weights,x,y))`
			`return weights, np.array(j_all)`
			`# %% no idea what this is`
			`w_tmp, j_tmp = grad_dsc(np.array([1e5,1000.0]),alpha=1e-9, x=data["BuildingArea"].to_numpy(), y=data["Price"].to_numpy(), n=1)`
			`j_tmp[1] / j_tmp[0]`
			`# %% do the actual gradient descent`
			`w_init = np.array([1e6, 1000.])`
			`x = np.array(data['BuildingArea'])`
			`y = np.array(data['Price'])`
			`w_gd_1e4, J_all_1e4 = grad_dsc(weights=w_init, alpha=np.float64(0.0001), x=x, y=y, n=100000)`

			`print('w_gd_1e4: {}'.format(w_gd_1e4))`
			`print('Vergleich zu Startkosten: {}'.format(J_all_1e4[-1]/J_all_1e4[0]))`
			`print('Vergleich zu analytischer Lösung: {}'.format(J_all_1e4[-1]/j_ana))`
			`print('(w0_gd - w0_ana)/w0_ana: {}'.format((w_gd_1e4[0]-w_ana[0])/w_ana[0]))`
			`print('(w1_gd - w1_ana)/w1_ana: {}'.format((w_gd_1e4[1]-w_ana[1])/w_ana[1]))`
			`# again, no idea what these values mean?`