01: i tried to do it all, but this is all crap

2025-01-20 15:05:31 +01:00 · 2025-01-20 15:05:31 +01:00 · c0f7e9d339
commit c0f7e9d339
parent 0f655c8719
7 changed files with 2344 additions and 12 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1 @@
-notebooks/.ipynb_checkpoints
+**/.ipynb_checkpoints
--- a/Kurzeinführung.ipynb
+++ b/Kurzeinführung.ipynb
--- a/feature.ipynb
+++ b/feature.ipynb
--- a/features.ipynb
+++ b/features.ipynb
--- a/notebooks/data
+++ b/notebooks/data
--- a/README.md
+++ b/README.md
@ -0,0 +1 @@
 Note: ich mach das zeug mit Zed repl
--- a/tasks/01-melbourne.py
+++ b/tasks/01-melbourne.py
@ -1,12 +1,15 @@
-from telnetlib import BM
+#!/usr/bin/env python
 # %% Imports
 from telnetlib import BM
 # imports überall im Code möglich, aber die Konvention ist alle benötigten import statements
 # gleich zu Beginn einer Datei zu machen
 # numpy ist ein Python-Modul für Numerik, das sowohl Funktionalität als auch Effizienz bietet
 import numpy as np
 # pandas ist sehr gut zum Arbeiten mit tabellarischen Daten, egal ob csv, xls oder xlsx
 from numpy.typing import NDArray as array
 from numpy import float64 as float
 import pandas as pd
 from pandas.core.dtypes.dtypes import time
 # plotting settings
 pd.plotting.register_matplotlib_converters()
 # matplotlib ist ein sehr umfangreiches Modul zum Erstellen von Visualisierungen/Plots
@ -17,23 +20,85 @@ import matplotlib.pyplot as plt
 # eine schöne Einführung in Seaborn: https://www.kaggle.com/learn/data-visualization
 import seaborn as sns
-
+# %% load data
 # %% Data
 data = pd.read_csv("../data/melb_data.csv").dropna()
-data = data[(data["BuildingArea"] < 1000) ]
+# filter data: Less than 400 area, and max 100 data points
 data = data[(data["BuildingArea"] < 400) ][:100][["BuildingArea", "Price"]]
 ax = sns.scatterplot(x=data['BuildingArea'], y=data['Price'])
-# ax.set(xlim=(0, 1000)) # brauch ich nicht mehr wenn ich die outlier aus den daten rausschmeiße
+data.head()
-
+# %% prepare data: prepend a constant 1 to every sample so the intercept is part of the weight vector
 # %% linear regression
 # Design matrix: a leading column of ones (the constant factor that carries
 # the intercept) next to the BuildingArea column, built in one vectorized
 # call instead of a per-row Python loop.
 X = np.column_stack((np.ones(len(data)), data['BuildingArea'].to_numpy(copy=True)))
 # Target vector: one price per row of data.
 Y = data['Price'].to_numpy(copy=True)
-# aber das ist noch nicht die fertige eingabe, da fehlt die konstante 1!
+
-# und mit Y ist auch irgendwas :(
+# %% solve the normal equations for the analytic weights
 w_ana = np.linalg.solve(X.T @ X , X.T @ Y)
-w_ana
+print(f"w_ana: {w_ana}")
 # %% define that h function, this is just f(x) = mx + b
 def h(weights: array[float], x):
     """
     Evaluate the linear hypothesis f(x) = b + m * x.
     weights[0] is the intercept b and weights[1] is the slope m.
     x may be a single number or a numpy array; the arithmetic is the same
     either way, and the type of the result follows the type of x.
     """
     intercept = weights[0]
     slope = weights[1]
     return intercept + slope * x
 # %% plot the h function combined with the calculated weights
 # Scatter the raw data, then overlay the line given by h with the analytic
 # weights. Only the smallest and largest BuildingArea are evaluated, since
 # two points are enough to draw a straight line.
 ax = sns.scatterplot(x=data['BuildingArea'], y=data['Price'])
 xplot = [min(data['BuildingArea']), max(data['BuildingArea'])]
 yplot = [h(w_ana, x) for x in xplot]
 sns.lineplot(x=xplot, y=yplot, ax=ax)
 # %% Bewertungsfunktion
 def j(weights: array[float], x: array[float], y: array[float]) -> float:
     """
     Squared-error cost J(w) = 1 / (2 * m) * sum((y - h(w, x)) ** 2).
     x and y must be numpy arrays of equal length m, one entry per sample:
     the residuals are computed element-wise and their squared sum is taken
     with the `@` dot product, which plain floats do not support.
     Returns the scalar cost, which is 0 for a perfect fit.
     """
     # Residuals compare the prediction for x (not for y) with the targets y.
     residuals = y - h(weights, x=x)
     # Scale the squared-error sum by 1 / (2m); the sum belongs in the
     # numerator, not in the denominator.
     return 1.0 / (2.0 * len(residuals)) * (residuals @ residuals)
 # example usage: evaluate the cost of the analytic weights on two made-up samples
 j(w_ana, np.array([1.1,1.3]), np.array([2.4,2.6]))
 # %% calculate score of analytic approach
 # pyright reports to_numpy as unknown, but the method does exist at runtime
 x = data['BuildingArea'].to_numpy(copy=True)
 y = data['Price'].to_numpy(copy=True)
 # cost of the analytic solution on the full (filtered) data set
 j_ana = j(w_ana, x=x, y=y)
 print('Kosten der analytischen Lösung: {}'.format(j_ana))
 # %% define grad_dsc functions
 # Gradient Descent
 # Let's be honest, no idea what I'm really doing here...
 def __gradsc_iter(weights: array[float], alpha: float, x: array[float], y: array[float]) -> array[float]:
     """
     Perform one gradient-descent update of the two linear-model weights.
     alpha is the learning rate; x and y are the sample inputs and targets.
     Returns a new array [w0, w1]; the weights passed in are not modified.
     """
     residuals: array[float] = y - h(weights=weights, x=x)
     # Learning rate divided by the number of samples, shared by both updates.
     step = alpha / len(x)
     new_intercept = weights[0] + step * sum(residuals)
     # Scale the residuals first, then take the dot product with x.
     new_slope = weights[1] + (step * residuals) @ x
     return np.array([new_intercept, new_slope])
 def grad_dsc(weights: array[float], alpha: float, x, y, n: int) -> tuple[array[float], array[float]]:
     """
     Run n gradient-descent iterations starting from `weights`.
     alpha is the learning rate; x and y are the sample inputs and targets.
     Returns the final weights together with an array of n + 1 costs: the
     cost of the initial weights followed by the cost after each iteration.
     The caller's weight array is not modified.
     """
     j_all = [j(weights, x, y)]
     for _ in range(n):
         # Rebind `weights` to the updated values so each iteration continues
         # from the previous one instead of restarting at the initial point.
         weights = __gradsc_iter(weights, alpha, x, y)
         j_all.append(j(weights, x, y))
     return weights, np.array(j_all)
 # %% sanity check: run a single gradient-descent iteration
 w_tmp, j_tmp = grad_dsc(np.array([1e5,1000.0]),alpha=1e-9, x=data["BuildingArea"].to_numpy(), y=data["Price"].to_numpy(), n=1)
 # ratio of the cost after that one iteration to the cost of the start weights
 j_tmp[1] / j_tmp[0]
 # %% do the actual gradient descent
 w_init = np.array([1e6, 1000.])
 x = np.array(data['BuildingArea'])
 y = np.array(data['Price'])
 w_gd_1e4, J_all_1e4 = grad_dsc(weights=w_init, alpha=np.float64(0.0001), x=x, y=y, n=100000)
 print('w_gd_1e4: {}'.format(w_gd_1e4))
 print('Vergleich zu Startkosten: {}'.format(J_all_1e4[-1]/J_all_1e4[0]))
 print('Vergleich zu analytischer Lösung: {}'.format(J_all_1e4[-1]/j_ana))
 print('(w0_gd - w0_ana)/w0_ana: {}'.format((w_gd_1e4[0]-w_ana[0])/w_ana[0]))
 print('(w1_gd - w1_ana)/w1_ana: {}'.format((w_gd_1e4[1]-w_ana[1])/w_ana[1]))
 # The printed values are, in order: the final weights, the final cost relative
 # to the starting cost, the final cost relative to the cost of the analytic
 # solution, and the relative deviation of each gradient-descent weight from
 # its analytic counterpart.
`@ -1 +1 @@`
	`notebooks/.ipynb_checkpoints`	`**/.ipynb_checkpoints`