01: i tried to do it all, but this is all crap

2025-01-20 15:05:31 +01:00 · 2025-01-20 15:05:31 +01:00 · c0f7e9d339
commit c0f7e9d339
parent 0f655c8719
7 changed files with 2344 additions and 12 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1 @@
-notebooks/.ipynb_checkpoints
+**/.ipynb_checkpoints
--- a/Kurzeinführung.ipynb
+++ b/Kurzeinführung.ipynb
--- a/feature.ipynb
+++ b/feature.ipynb
--- a/features.ipynb
+++ b/features.ipynb
--- a/notebooks/data
+++ b/notebooks/data
--- a/README.md
+++ b/README.md
@ -0,0 +1 @@
+Note: ich mach das zeug mit Zed repl
--- a/tasks/01-melbourne.py
+++ b/tasks/01-melbourne.py
@ -1,12 +1,15 @@
-from telnetlib import BM
+#!/usr/bin/env python
 # %% Imports
+from telnetlib import BM
 # imports überall im Code möglich, aber die Konvention ist alle benötigten import statements
 # gleich zu Beginn einer Datei zu machen
-
 # numpy ist ein Python-Modul für Numerik, das sowohl Funktionalität als auch Effizienz bietet
 import numpy as np
 # pandas ist sehr gut zum Arbeiten mit tabellarischen Daten, egal ob csv, xls oder xlsx
+from numpy.typing import NDArray as array
+from numpy import float64 as float
 import pandas as pd
+from pandas.core.dtypes.dtypes import time
 # plotting settings
 pd.plotting.register_matplotlib_converters()
 # matplotlib ist ein sehr umfangreiches Modul zum Erstellen von Visualisierungen/Plots
@ -17,23 +20,85 @@ import matplotlib.pyplot as plt
 # eine schöne Einführung in Seaborn: https://www.kaggle.com/learn/data-visualization
 import seaborn as sns

-
-# %% Data
+# %% load data
 data = pd.read_csv("../data/melb_data.csv").dropna()
-data = data[(data["BuildingArea"] < 1000) ]
+# filter data: Less than 400 area, and max 100 data points
+data = data[(data["BuildingArea"] < 400) ][:100][["BuildingArea", "Price"]]
 ax = sns.scatterplot(x=data['BuildingArea'], y=data['Price'])
-# ax.set(xlim=(0, 1000)) # brauch ich nicht mehr wenn ich die outlier aus den daten rausschmeiße
+data.head()

-
-# %% linear regression
+# %% prepare data with useless math extra values because we need to add one as a factor to all of these
 X = []
 Y = []
+# aufbereitung, x braucht noch den konstanten eins faktor
 for _, row in data.iterrows():
    X.append([1]+ [row['BuildingArea']])
    Y.append(row['Price'])
 X = np.array(X)
 Y = np.array(Y)
-# aber das ist noch nicht die fertige eingabe, da fehlt die konstante 1!
-# und mit Y ist auch irgendwas :(
+
+# %% solve the linear thing
 w_ana = np.linalg.solve(X.T @ X , X.T @ Y)
-w_ana
+print(f"w_ana: {w_ana}")
+
+# %% define that h function, this is just f(x) = mx + b
+
+def h(weights: array[float], x):
+    """
+    Evaluate the linear hypothesis ``weights[0] + weights[1] * x``.
+
+    ``weights[0]`` acts as the intercept and ``weights[1]`` as the slope.
+    ``x`` may be a single number or a numpy array; the arithmetic is the
+    same in both cases, so the return type depends on the type of ``x``.
+    """
+    intercept = weights[0]
+    slope = weights[1]
+    return intercept + slope * x
+
+# %% plot the h function combined with the calculated weights
+
+
+# the raw samples form the background layer of the figure
+ax = sns.scatterplot(x=data['BuildingArea'], y=data['Price'])
+
+# a straight line only needs two points: evaluate h at the smallest and the
+# largest building area and connect them on the same axes
+xplot = [min(data['BuildingArea']), max(data['BuildingArea'])]
+yplot = [h(w_ana, xplot[0]), h(w_ana, xplot[1])]
+sns.lineplot(x=xplot, y=yplot, ax=ax)
+# %% Bewertungsfunktion
+def j(weights: array[float], x: array[float] ,y: array[float]) -> float:
+    """
+    Cost (half mean squared error) of the linear model ``h`` on the samples.
+
+    Computes ``1 / (2 * m) * sum((y - h(weights, x)) ** 2)`` with
+    ``m = len(y)``.
+
+    Args:
+        weights: ``[intercept, slope]`` passed on to ``h``.
+        x: sample inputs as a 1-D numpy array.
+        y: sample targets as a 1-D numpy array of the same length as ``x``.
+
+    Returns:
+        The scalar cost; 0 means the line passes through every sample.
+    """
+    # x and y have to be numpy arrays: the residuals are computed element-wise
+    # and then combined with the ``@`` operator, which plain floats lack
+    errw = y - h(weights, x=x)
+    # errw @ errw is the sum of squared residuals; scale it by 1 / (2 m)
+    return (errw @ errw) / (2.0 * len(errw))
+# example usage: cost of the analytic weights on two made-up sample points
+# (the bare expression only shows a value when the cell is run interactively)
+j(w_ana, np.array([1.1,1.3]), np.array([2.4,2.6]))
+# %% calculate score of analytic approach
+# pyright reports to_numpy as unknown, but the call works at runtime
+# copy=True requests arrays that do not share memory with the DataFrame
+x = data['BuildingArea'].to_numpy(copy=True)
+y = data['Price'].to_numpy(copy=True)
+# cost of the closed-form weights; the gradient-descent result is compared
+# against this value further down
+j_ana = j(w_ana, x=x, y=y)
+print('Kosten der analytischen Lösung: {}'.format(j_ana))
+# %% define grad_dsc functions
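+# Gradient Descent
+# Idea: start from some weights and repeatedly move them a small step (scaled
+# by the learning rate alpha) against the gradient of the cost function, so
+# that the cost shrinks from step to step.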
+def __gradsc_iter(weights: array[float], alpha: float, x: array[float], y: array[float]) -> array[float]:
+    """
+    Perform a single gradient-descent step for the linear model ``h``.
+
+    Args:
+        weights: current ``[intercept, slope]``.
+        alpha: learning rate (step size).
+        x: sample inputs as a 1-D numpy array.
+        y: sample targets, same length as ``x``.
+
+    Returns:
+        A new array holding the updated ``[intercept, slope]``; the array
+        passed in as ``weights`` is left unmodified.
+    """
+    # residuals y - h(x); h broadcasts over the whole array at once
+    errw: array[float] = y - h(x=x, weights=weights)
+    # For the cost 1/(2m) * sum(errw**2) the gradient is
+    # -1/m * [sum(errw), errw @ x], so stepping against it means adding.
+    step = alpha / len(x)
+    return np.array([
+        # np.sum reduces in native code instead of looping over the elements
+        weights[0] + step * np.sum(errw),
+        weights[1] + step * (errw @ x),
+    ])
+
+def grad_dsc(weights: array[float], alpha: float, x, y, n: int) -> tuple[array[float], array[float]]:
+    """
+    Run ``n`` gradient-descent steps starting from ``weights``.
+
+    Args:
+        weights: start values ``[intercept, slope]``; not modified in place.
+        alpha: learning rate handed to every step.
+        x: sample inputs as a 1-D numpy array.
+        y: sample targets, same length as ``x``.
+        n: number of steps to perform.
+
+    Returns:
+        ``(final_weights, costs)`` where ``costs`` has ``n + 1`` entries:
+        ``costs[0]`` is the cost of the start weights and ``costs[k]`` the
+        cost after ``k`` steps.
+    """
+    j_all = [j(weights,x,y)]
+    for _ in range(n):
+        # rebind weights so that the next step continues from the updated
+        # values and the recorded cost belongs to them
+        weights = __gradsc_iter(weights, alpha, x, y)
+        j_all.append(j(weights,x,y))
+    return weights, np.array(j_all)
+# %% sanity check: a single gradient-descent step
+# run exactly one step (n=1) with a very small learning rate
+w_tmp, j_tmp = grad_dsc(np.array([1e5,1000.0]),alpha=1e-9, x=data["BuildingArea"].to_numpy(), y=data["Price"].to_numpy(), n=1)
+# j_tmp[0] is the cost of the start weights, j_tmp[1] the cost recorded after
+# the step; a ratio below 1 means the recorded cost went down
+j_tmp[1] / j_tmp[0]
+# %% do the actual gradient descent
+w_init = np.array([1e6, 1000.])
+x = np.array(data['BuildingArea'])
+y = np.array(data['Price'])
+# NOTE(review): BuildingArea is used unscaled (values up to 400); a step size
+# of 1e-4 may be too large for the iteration to converge on such inputs --
+# confirm by inspecting J_all_1e4
+w_gd_1e4, J_all_1e4 = grad_dsc(weights=w_init, alpha=np.float64(0.0001), x=x, y=y, n=100000)
+
+print('w_gd_1e4: {}'.format(w_gd_1e4))
+print('Vergleich zu Startkosten: {}'.format(J_all_1e4[-1]/J_all_1e4[0]))
+print('Vergleich zu analytischer Lösung: {}'.format(J_all_1e4[-1]/j_ana))
+print('(w0_gd - w0_ana)/w0_ana: {}'.format((w_gd_1e4[0]-w_ana[0])/w_ana[0]))
+print('(w1_gd - w1_ana)/w1_ana: {}'.format((w_gd_1e4[1]-w_ana[1])/w_ana[1]))
+# How to read the printed values:
+# - "Vergleich zu Startkosten": last recorded cost divided by the first one;
+#   a value below 1 means the cost decreased over the run.
+# - "Vergleich zu analytischer Lösung": last recorded cost divided by the cost
+#   of the analytic weights; a value close to 1 means both agree.
+# - the last two lines: relative deviation of each gradient-descent weight
+#   from its analytic counterpart; values near 0 mean the weights match.