01: i tried to do it all, but this is all crap
This commit is contained in:
parent
0f655c8719
commit
c0f7e9d339
|
@ -1 +1 @@
|
|||
notebooks/.ipynb_checkpoints
|
||||
**/.ipynb_checkpoints
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,12 +1,15 @@
|
|||
from telnetlib import BM
|
||||
#!/usr/bin/env python
|
||||
# %% Imports
|
||||
from telnetlib import BM
|
||||
# imports überall im Code möglich, aber die Konvention ist alle benötigten import statements
|
||||
# gleich zu Beginn einer Datei zu machen
|
||||
|
||||
# numpy ist ein Python-Modul für Numerik, das sowohl Funktionalität als auch Effizienz bietet
|
||||
import numpy as np
|
||||
# pandas ist sehr gut zum Arbeiten mit tabellarischen Daten, egal ob csv, xls oder xlsx
|
||||
from numpy.typing import NDArray as array
|
||||
from numpy import float64 as float
|
||||
import pandas as pd
|
||||
from pandas.core.dtypes.dtypes import time
|
||||
# plotting settings
|
||||
pd.plotting.register_matplotlib_converters()
|
||||
# matplotlib ist ein sehr umfangreiches Modul zum Erstellen von Visualisierungen/Plots
|
||||
|
@ -17,23 +20,85 @@ import matplotlib.pyplot as plt
|
|||
# eine schöne Einführung in Seaborn: https://www.kaggle.com/learn/data-visualization
|
||||
import seaborn as sns
|
||||
|
||||
|
||||
# %% Data
|
||||
# %% load data
|
||||
# Read the Melbourne housing data and drop every row that has a missing value.
data = pd.read_csv("../data/melb_data.csv").dropna()
# Keep only rows with BuildingArea below 400, only the two columns used from
# here on, and at most the first 100 of those rows.
data = data.loc[data["BuildingArea"] < 400, ["BuildingArea", "Price"]].head(100)
# No explicit x-limit is needed on the plot: the outliers were filtered out of
# the data above.
ax = sns.scatterplot(data=data, x="BuildingArea", y="Price")
data.head()
|
||||
|
||||
|
||||
# %% linear regression
|
||||
# %% prepare data with useless math extra values because we need to add one as a factor to all of these
|
||||
# Design matrix for the normal equations: a leading column of ones (the
# constant factor that multiplies the intercept) next to the BuildingArea
# feature. Built in one vectorised step instead of a row-by-row loop.
X = np.column_stack((np.ones(len(data)), data["BuildingArea"].to_numpy()))
# Target vector: one price per row of X.
Y = data["Price"].to_numpy()
# X already contains the constant 1 column, so X and Y are ready to be used
# as inputs for the analytic solution.
|
||||
|
||||
# %% solve the linear thing
|
||||
# Analytic least-squares fit: solve the normal equations (X^T X) w = X^T Y.
Xt = X.T
w_ana = np.linalg.solve(Xt @ X, Xt @ Y)
print(f"w_ana: {w_ana}")
|
||||
|
||||
# %% define that h function, this is just f(x) = mx + b
|
||||
|
||||
def h(weights: array[float], x):
    """Evaluate the linear hypothesis ``f(x) = b + m * x``.

    ``weights[0]`` is used as the intercept ``b`` and ``weights[1]`` as the
    slope ``m``. ``x`` may be a single number or a NumPy array; the same
    arithmetic is applied either way, so the type of the result follows the
    type of ``x``.
    """
    intercept, slope = weights[0], weights[1]
    return intercept + slope * x
|
||||
|
||||
# %% plot the h function combined with the calculated weights
|
||||
|
||||
|
||||
# Scatter plot of the data with the fitted line drawn on the same axes.
ax = sns.scatterplot(x=data['BuildingArea'], y=data['Price'])

# A straight line is fully determined by two points, so h is evaluated only
# at the smallest and the largest BuildingArea. Series.min()/max() are used
# instead of the builtins, which would iterate the Series element by element.
xplot = [data['BuildingArea'].min(), data['BuildingArea'].max()]
yplot = [h(w_ana, x) for x in xplot]
sns.lineplot(x=xplot, y=yplot, ax=ax)
|
||||
# %% Bewertungsfunktion
|
||||
def j(weights: array[float], x: array[float], y: array[float]) -> float:
    """Return the squared-error cost of the linear model ``h`` on ``(x, y)``.

    Computes ``J(w) = 1 / (2 * m) * sum((y - h(w, x)) ** 2)`` where ``m`` is
    the number of data points.

    Args:
        weights: Model weights passed on to ``h`` (intercept, slope).
        x: Feature values. Must be a NumPy array, because the residuals are
            combined with ``len`` and the ``@`` dot product.
        y: Target values, same length as ``x``.

    Returns:
        The cost as a single floating-point number.

    Raises:
        ValueError: If no data points are given.
    """
    # Residuals between the observed targets and the predictions made from x.
    errw = y - h(weights, x)
    m = len(errw)
    if m == 0:
        raise ValueError("j() needs at least one data point")
    # errw @ errw is the sum of squared residuals; it is scaled by 1 / (2m).
    return (errw @ errw) / (2.0 * m)
|
||||
# example usage
|
||||
j(w_ana, x=np.array([1.1, 1.3]), y=np.array([2.4, 2.6]))
# %% calculate score of analytic approach
# pyright reports to_numpy as unknown, but the call works at runtime.
x, y = (
    data[column].to_numpy(copy=True) for column in ('BuildingArea', 'Price')
)
# Cost of the analytically computed weights on the data set.
j_ana = j(w_ana, x=x, y=y)
print('Kosten der analytischen Lösung: {}'.format(j_ana))
|
||||
# %% define grad_dsc functions
|
||||
# Gradient Descent
|
||||
# Let's be honest, no idea what I'm really doing here...
|
||||
def __gradsc_iter(weights: array[float], alpha: float, x: array[float], y: array[float]) -> array[float]:
    """Perform one gradient-descent step and return the updated weights.

    Args:
        weights: Current weights (intercept, slope) as used by ``h``.
        alpha: Step size (learning rate).
        x: Feature values as a NumPy array.
        y: Target values, same length as ``x``.

    Returns:
        A new two-element array with the updated intercept and slope; the
        ``weights`` argument itself is not modified.
    """
    # Residuals of the current model on the data.
    errw: array[float] = y - h(x=x, weights=weights)
    # Step size divided by the number of data points.
    step = alpha / len(x)
    return np.array([
        # Intercept moves by the scaled sum of the residuals.
        weights[0] + step * errw.sum(),
        # Slope moves by the scaled dot product of residuals and features.
        weights[1] + step * (errw @ x),
    ])
|
||||
|
||||
def grad_dsc(weights: array[float], alpha: float, x, y, n: int) -> tuple[array[float], array[float]]:
    """Run ``n`` gradient-descent steps and record the cost after each one.

    Args:
        weights: Starting weights (intercept, slope). The caller's array is
            not modified; each step produces a new array.
        alpha: Step size (learning rate) passed to every step.
        x: Feature values.
        y: Target values, same length as ``x``.
        n: Number of steps to perform.

    Returns:
        A tuple ``(weights, costs)``: the weights after the last step and an
        array of ``n + 1`` costs, the cost of the starting weights followed
        by the cost after each step.
    """
    j_all = [j(weights, x, y)]
    for _ in range(n):
        # Carry the updated weights into the next iteration; otherwise every
        # step would restart from the initial weights.
        weights = __gradsc_iter(weights, alpha, x, y)
        j_all.append(j(weights, x, y))
    return weights, np.array(j_all)
|
||||
# %% sanity check: cost ratio after a single gradient descent step
|
||||
# Run exactly one gradient-descent step from a hand-picked starting point.
w_tmp, j_tmp = grad_dsc(
    np.array([1e5, 1000.0]),
    alpha=1e-9,
    x=data["BuildingArea"].to_numpy(),
    y=data["Price"].to_numpy(),
    n=1,
)
# Cost after the step relative to the cost before it.
j_tmp[1] / j_tmp[0]
|
||||
# %% do the actual gradient descent
|
||||
# Starting weights and independent copies of the feature/target columns.
w_init = np.array([1e6, 1000.])
x = data['BuildingArea'].to_numpy(copy=True)
y = data['Price'].to_numpy(copy=True)

w_gd_1e4, J_all_1e4 = grad_dsc(
    weights=w_init,
    alpha=np.float64(0.0001),
    x=x,
    y=y,
    n=100000,
)

# Relative deviation of each gradient-descent weight from the analytic one.
rel_dev = (w_gd_1e4 - w_ana) / w_ana

print('w_gd_1e4: {}'.format(w_gd_1e4))
print('Vergleich zu Startkosten: {}'.format(J_all_1e4[-1]/J_all_1e4[0]))
print('Vergleich zu analytischer Lösung: {}'.format(J_all_1e4[-1]/j_ana))
print('(w0_gd - w0_ana)/w0_ana: {}'.format(rel_dev[0]))
print('(w1_gd - w1_ana)/w1_ana: {}'.format(rel_dev[1]))
# Meaning of the printed values:
# - "Startkosten" ratio: final cost divided by the cost of the starting weights.
# - "analytischer Lösung" ratio: final cost divided by the cost of the
#   analytic solution.
# - last two lines: relative difference between the gradient-descent weights
#   and the analytic weights (intercept, then slope).
|
||||
|
|
Loading…
Reference in New Issue