import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import mglearn
import seaborn as sns
Übung 3
Decision Trees, Dummy Features, Zeitreihenanalyse, PCA
Python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, ward
# set default values for all plotting:
=12
size'axes.labelsize'] = size
plt.rcParams['xtick.labelsize'] = size
plt.rcParams['ytick.labelsize'] = size
plt.rcParams['legend.fontsize'] = size
plt.rcParams['figure.figsize'] = (6.29, 6/10*6.29)
plt.rcParams['lines.linewidth'] = 1
plt.rcParams['axes.grid'] = True
plt.rcParams[# print(plt.rcParams)
# import locale #should you want german notation for numbers, then use the locale package
# locale.setlocale(locale.LC_ALL, "deu_deu")
# plt.rcParams['axes.formatter.use_locale'] = True
# Stylefile
# plt.style.use('C:/Users/edel/Documents/Python Scripts/Stylefile/custom_figure_style.mplstyle')
Decision Trees
Übung 1: Hotel-Reservations Datensatz mit Forest Classifiern
- Laden Sie den Hotel Reservations Datensatz von Kaggle herunter.
- Betrachten Sie den Datensatz und ersetzen Sie nicht-numerische Features durch Dummy-Features.
- Ermitteln Sie die vier wichtigsten Features des Datensatzes anhand der Korrelationsmatrix und erstellen Sie sinnhafte Darstellungen zwischen dem jeweiligen Feature und dem Target.
- Verwenden Sie den Datensatz und untersuchen Sie die Performance der Ensembles of Decision Tree Algorithmen für unterschiedliche Parameter bei einem fixen Train-Test-Split (random_state=10).
- Erstellen Sie einen Subplot, der 3 Diagramme im Format 3x1 enthält. Plotten Sie jeweils den Testscore in Abhängigkeit des Hyperparameters (max_depth, n_estimators, learning_rate) für die drei untersuchten Algorithmen.
Lösung:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
=pd.read_csv('daten/Hotel Reservations.csv')
df_hotel=df_hotel.drop(columns=['Booking_ID'])
df_hotel=pd.get_dummies(df_hotel)
df_h_dummified=df_h_dummified.drop(columns=['booking_status_Canceled','booking_status_Not_Canceled'])
X=df_h_dummified['booking_status_Canceled']
y=df_h_dummified.corr().booking_status_Canceled
corrs=abs(corrs)
corrs=corrs.sort_values(ascending=False)
corrs_sorted#print(corrs_sorted)
=df_h_dummified[['lead_time','arrival_year','avg_price_per_room','no_of_special_requests','booking_status_Canceled']] #highly correlated features X_hc
=['LT','Year','Price','Specialties','Canceled']
X_hc.columns=pd.plotting.scatter_matrix(X_hc[::20],
sm=(15, 12), marker='.',
figsize={'bins': 30,'color':'grey'}, s=30, alpha=.8,color='red',
hist_kwds=plt.get_cmap('coolwarm'))
cmap
#y ticklabels
'size', 14) for item in sm.ravel()]
[plt.setp(item.yaxis.get_majorticklabels(), #x ticklabels
'size', 14,'rotation',0) for item in sm.ravel()]
[plt.setp(item.xaxis.get_majorticklabels(), #y labels
'size', 14, 'rotation',0,'ha','right') for item in sm.ravel()]
[plt.setp(item.yaxis.get_label(), #x labels
'size',14) for item in sm.ravel()]
[plt.setp(item.xaxis.get_label(),
plt.tight_layout()False) plt.grid(
/usr/lib/python3.10/site-packages/pandas/plotting/_matplotlib/misc.py:97: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
ax.scatter(
plt.figure()==1],color='black',fill=True,alpha=.2,label='Canceled')
sns.kdeplot(X_hc.LT[X_hc.Canceled==0],color='red',fill=True,alpha=.2,label='Not Canceled')
sns.kdeplot(X_hc.LT[X_hc.Canceled
plt.tight_layout()'Tage Buchung vor Reiseantritt')
plt.xlabel('Häufigkeit')
plt.ylabel(
plt.legend()False) plt.grid(
plt.figure()==1],color='black',fill=True,alpha=.2,label='Canceled')
sns.kdeplot(X_hc.Year[X_hc.Canceled==0],color='red',fill=True,alpha=.2,label='Not Canceled')
sns.kdeplot(X_hc.Year[X_hc.Canceled2017,2018])
plt.xticks([
plt.tight_layout()'Jahr')
plt.xlabel('Häufigkeit')
plt.ylabel(0,10.01,2))
plt.yticks(np.arange(=2)
plt.legend(locFalse) plt.grid(
plt.figure()==1],color='black',fill=True,alpha=.2,label='Canceled')
sns.kdeplot(X_hc.Price[X_hc.Canceled==0],color='red',fill=True,alpha=.2,label='Not Canceled')
sns.kdeplot(X_hc.Price[X_hc.Canceled
plt.tight_layout()'Preis')
plt.xlabel('Häufigkeit')
plt.ylabel(
plt.legend()False) plt.grid(
plt.figure()==1],color='black',fill=True,alpha=.2,label='Canceled')
sns.kdeplot(X_hc.Specialties[X_hc.Canceled==0],color='red',fill=True,alpha=.2,label='Not Canceled')
sns.kdeplot(X_hc.Specialties[X_hc.Canceled
plt.tight_layout()'Anzahl Sonderwünsche')
plt.xlabel('Häufigkeit')
plt.ylabel(
plt.legend()False) plt.grid(
= train_test_split(
X_train, X_test, y_train, y_test =10) X, y, random_state
#Untersuchen der Performance
= DecisionTreeClassifier(max_depth=3)
algo #algo = RandomForestClassifier(n_estimators=5, random_state=2)
#algo = GradientBoostingClassifier(learning_rate=0.001)
algo.fit(X_train, y_train)
print(f"Accuracy on training set: {algo.score(X_train, y_train):.3f}")
print(f"Accuracy on test set: {algo.score(X_test, y_test):.3f}")
Accuracy on training set: 0.786
Accuracy on test set: 0.787
# Berechnung der einzelnen Test Scores
='max_depth'
DT_string='n_estimators'
RF_string='learning_rate'
GB_string=np.arange(1,20,1)
m_d=np.arange(1,32,2)
n_est=np.logspace(-3,3,7)
l_r
def param_var(algo,DT_stringparam,param):
=[]
test_scoresfor i in range(len(param)):
={DT_string:m_d[i]}
params**params)
algo.set_params(
algo.fit(X_train,y_train)
test_scores.append(algo.score(X_test,y_test))return test_scores
=param_var(DecisionTreeClassifier(),DT_string,m_d)
test_scores_dt=param_var(RandomForestClassifier(),RF_string,n_est)
test_scores_rf=param_var(GradientBoostingClassifier(random_state=2),GB_string,l_r) test_scores_gb
=plt.figure(figsize=(6.29,1.8*6.29))
fig
=fig.add_subplot(3,1,1)
ax1=fig.add_subplot(3,1,2)
ax2=fig.add_subplot(3,1,3)
ax3
='solid',marker='o',color='black',label='Decision Tree')
ax1.plot(m_d,test_scores_dt,ls'Max depth')
ax1.set_xlabel('Testscore')
ax1.set_ylabel(0,1)
ax1.set_ylim(0,20.1,2))
ax1.set_xticks(np.arange(True)
ax1.grid(=4)
ax1.legend(loc
='solid',marker='o',color='black',label='Random Forest')
ax2.plot(n_est,test_scores_rf,ls'Number of estimators')
ax2.set_xlabel('Testscore')
ax2.set_ylabel(0,1)
ax2.set_ylim(True)
ax2.grid(=4)
ax2.legend(loc
='solid',marker='o',color='black',label='Gradient Boosting')
ax3.semilogx(l_r,test_scores_gb,ls'Learning rate')
ax3.set_xlabel('Testscore')
ax3.set_ylabel(0,1)
ax3.set_ylim(True)
ax3.grid(=4)
ax3.legend(loc
fig.tight_layout()
Aufgabe 1: 24h-Strompreise
Untersuchen Sie die 24h-Strompreise der Datei dshistory2017.xls
der EXAA (Energy Exchange Austria bzgl. der Frage: “Wie gut kann man aus der Uhrzeit (Stunde 1, 2, …, 24) und dem Wochentag (Mo, Di, …) vorhersagen, ob der Strompreis Price (EUR)
über oder unter dem Tagesmittel liegt?”
- Machen Sie sich zuerst passende Abbildungen der Daten.
- Bringen Sie die Daten in die übliche \(X\)-\(y\)-Form.
- Vergleichen Sie verschiedene Algorithmen.
Lösung:
= pd.read_excel('daten/dshistory2017.xls', sheet_name='Price (EUR)',
price =0, skiprows=1)
index_col= price.iloc[:,:24].copy()
df
df.shape# check=df.applymap(np.isreal)
#print(df.dtypes)
=df[df.applymap(np.isreal).all('columns')] #Only if all columns in a row are real values the row is not dropped
df=True) #dropNANs
df.dropna(inplaceprint(df.shape)
(363, 24)
3) df.head(
hEXA01 | hEXA02 | hEXA03 | hEXA04 | hEXA05 | hEXA06 | hEXA07 | hEXA08 | hEXA09 | hEXA10 | ... | hEXA15 | hEXA16 | hEXA17 | hEXA18 | hEXA19 | hEXA20 | hEXA21 | hEXA22 | hEXA23 | hEXA24 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Delivery Date | |||||||||||||||||||||
2017-12-31 | -20.1 | -28.2 | -38.78 | -25.63 | -17.60 | -19.93 | -27.88 | -17.35 | -4.99 | 1.41 | ... | -36.82 | -20.78 | 3.02 | 14.18 | 13.28 | 7.34 | 0.04 | 1.91 | 10.05 | 5.53 |
2017-12-30 | 15.5 | 10.7 | 8.20 | 8.20 | 9.40 | 10.30 | 8.81 | 10.99 | 17.49 | 23.22 | ... | 16.00 | 14.90 | 15.54 | 18.71 | 17.00 | 14.31 | 6.78 | 1.69 | 0.55 | -6.87 |
2017-12-29 | 7.2 | 4.89 | 3.56 | 3.23 | 3.84 | 9.13 | 15.32 | 26.06 | 30.99 | 32.22 | ... | 31.20 | 34.60 | 38.96 | 40.54 | 38.20 | 34.12 | 30.58 | 28.80 | 28.61 | 19.86 |
3 rows × 24 columns
# Maximale Tageswerte:
plt.figure()max(axis=1),color='red')
plt.plot(df.0,150)
plt.ylim('Datum (YYYY-MM)')
plt.xlabel('Preis (€/MWh)')
plt.ylabel(
plt.tight_layout()
print(df.max(axis=1).max())
143.1
Falschfarbenbild der Rohdaten, colormaps
= np.array([[1,2],
X 3,4],
[5,6]])
[=plt.cm.YlOrRd, extent=[0, 24, 0, 36.3], aspect='auto')
plt.imshow(X, cmap; plt.colorbar()
= df.values.astype(float) Z
plt.figure()='bilinear', # compare with interpolation=None
plt.imshow(Z, interpolation=plt.cm.coolwarm, extent=[0, 24, 0, 363], aspect='auto');
cmap'Stunde')
plt.xlabel('Tag')
plt.ylabel(
plt.colorbar()False); plt.grid(
'mean'] = df.apply(np.mean, axis=1) #Mittelwerte
df['day'] = df.index.dayofweek # Monday=0, Sunday=6
df[3) df.head(
hEXA01 | hEXA02 | hEXA03 | hEXA04 | hEXA05 | hEXA06 | hEXA07 | hEXA08 | hEXA09 | hEXA10 | ... | hEXA17 | hEXA18 | hEXA19 | hEXA20 | hEXA21 | hEXA22 | hEXA23 | hEXA24 | mean | day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Delivery Date | |||||||||||||||||||||
2017-12-31 | -20.1 | -28.2 | -38.78 | -25.63 | -17.60 | -19.93 | -27.88 | -17.35 | -4.99 | 1.41 | ... | 3.02 | 14.18 | 13.28 | 7.34 | 0.04 | 1.91 | 10.05 | 5.53 | -12.000833 | 6 |
2017-12-30 | 15.5 | 10.7 | 8.20 | 8.20 | 9.40 | 10.30 | 8.81 | 10.99 | 17.49 | 23.22 | ... | 15.54 | 18.71 | 17.00 | 14.31 | 6.78 | 1.69 | 0.55 | -6.87 | 13.447917 | 5 |
2017-12-29 | 7.2 | 4.89 | 3.56 | 3.23 | 3.84 | 9.13 | 15.32 | 26.06 | 30.99 | 32.22 | ... | 38.96 | 40.54 | 38.20 | 34.12 | 30.58 | 28.80 | 28.61 | 19.86 | 24.196667 | 4 |
3 rows × 26 columns
# binäre Abweichung vom Tagesmittel:
for h in range(1, 25):
= df.iloc[:,h-1] - df['mean']
df[h] # for i in range(len(df['mean'])):
# if df.iloc[i,h+25]>0:
# df.iloc[i,h+25]=1
# else:
# df.iloc[i,h+25]=0
= df[h].apply(lambda x: 1 if x > 0 else 0) df[h]
3) df.head(
hEXA01 | hEXA02 | hEXA03 | hEXA04 | hEXA05 | hEXA06 | hEXA07 | hEXA08 | hEXA09 | hEXA10 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Delivery Date | |||||||||||||||||||||
2017-12-31 | -20.1 | -28.2 | -38.78 | -25.63 | -17.60 | -19.93 | -27.88 | -17.35 | -4.99 | 1.41 | ... | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
2017-12-30 | 15.5 | 10.7 | 8.20 | 8.20 | 9.40 | 10.30 | 8.81 | 10.99 | 17.49 | 23.22 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 |
2017-12-29 | 7.2 | 4.89 | 3.56 | 3.23 | 3.84 | 9.13 | 15.32 | 26.06 | 30.99 | 32.22 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 |
3 rows × 50 columns
= df.melt(id_vars=['day'], value_vars=range(1,25))
my_df 3))
display(my_df.head(3)) display(my_df.tail(
day | variable | value | |
---|---|---|---|
0 | 6 | 1 | 0 |
1 | 5 | 1 | 1 |
2 | 4 | 1 | 0 |
day | variable | value | |
---|---|---|---|
8709 | 1 | 24 | 0 |
8710 | 0 | 24 | 0 |
8711 | 6 | 24 | 0 |
={"variable":"hour", "value":"above"}, inplace=True)
my_df.rename(columns3) my_df.head(
day | hour | above | |
---|---|---|---|
0 | 6 | 1 | 0 |
1 | 5 | 1 | 1 |
2 | 4 | 1 | 0 |
= pd.pivot_table(my_df, index='day', columns='hour',
my_pred ='above', aggfunc=np.mean)
values=False, inplace=True)
my_pred.sort_index(ascending my_pred
hour | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
day | |||||||||||||||||||||
6 | 0.627451 | 0.411765 | 0.176471 | 0.098039 | 0.137255 | 0.098039 | 0.078431 | 0.156863 | 0.333333 | 0.411765 | ... | 0.176471 | 0.215686 | 0.411765 | 0.666667 | 1.000000 | 1.000000 | 0.980392 | 0.901961 | 0.921569 | 0.607843 |
5 | 0.519231 | 0.269231 | 0.192308 | 0.173077 | 0.115385 | 0.134615 | 0.115385 | 0.423077 | 0.846154 | 0.923077 | ... | 0.134615 | 0.250000 | 0.365385 | 0.750000 | 1.000000 | 0.980769 | 0.788462 | 0.634615 | 0.634615 | 0.423077 |
4 | 0.076923 | 0.019231 | 0.000000 | 0.000000 | 0.000000 | 0.038462 | 0.538462 | 1.000000 | 1.000000 | 1.000000 | ... | 0.326923 | 0.384615 | 0.480769 | 0.846154 | 1.000000 | 0.980769 | 0.846154 | 0.615385 | 0.538462 | 0.192308 |
3 | 0.076923 | 0.038462 | 0.019231 | 0.019231 | 0.019231 | 0.000000 | 0.576923 | 0.980769 | 1.000000 | 0.980769 | ... | 0.365385 | 0.442308 | 0.538462 | 0.903846 | 0.980769 | 1.000000 | 0.769231 | 0.673077 | 0.519231 | 0.173077 |
2 | 0.076923 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.576923 | 1.000000 | 1.000000 | 1.000000 | ... | 0.326923 | 0.365385 | 0.461538 | 0.903846 | 0.961538 | 0.961538 | 0.903846 | 0.576923 | 0.423077 | 0.115385 |
1 | 0.019231 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.557692 | 0.961538 | 0.980769 | 0.961538 | ... | 0.365385 | 0.384615 | 0.480769 | 0.903846 | 1.000000 | 0.980769 | 0.826923 | 0.692308 | 0.519231 | 0.096154 |
0 | 0.038462 | 0.019231 | 0.019231 | 0.019231 | 0.019231 | 0.019231 | 0.653846 | 0.942308 | 0.903846 | 0.942308 | ... | 0.403846 | 0.480769 | 0.634615 | 0.923077 | 1.000000 | 1.000000 | 0.961538 | 0.788462 | 0.615385 | 0.134615 |
7 rows × 24 columns
plt.figure()=plt.cm.bwr, extent=[0, 24, 0, 6],
plt.imshow(my_pred.values, cmap='auto', interpolation='bilinear') # compare with interpolation=None
aspect'hour')
plt.xlabel('day')
plt.ylabel(; plt.colorbar()
= my_df[['hour','day']].values
X X
array([[1, 6],
[1, 5],
[1, 4],
...,
[24, 1],
[24, 0],
[24, 6]], dtype=object)
= my_df['above'].values
y y
array([0, 1, 0, ..., 0, 0, 0])
= train_test_split(X, y, random_state=10) X_train, X_test, y_train, y_test
= DecisionTreeClassifier(max_depth=3)
algo = RandomForestClassifier(n_estimators=5, random_state=0)
algo = GradientBoostingClassifier(learning_rate=0.1, n_estimators=35)
algo
algo.fit(X_train, y_train)
print(f"Accuracy on training set: {algo.score(X_train, y_train):.3f}")
print(f"Accuracy on test set: {algo.score(X_test, y_test):.3f}")
Accuracy on training set: 0.809
Accuracy on test set: 0.811
def plot_tree_partition(X, y, tree):
= 0.5
eps = X[:, 0].min() - eps, X[:, 0].max() + eps
x_min, x_max = X[:, 1].min() - eps, X[:, 1].max() + eps
y_min, y_max = np.linspace(x_min, x_max, 1000)
xx = np.linspace(y_min, y_max, 1000)
yy = np.meshgrid(xx, yy)
X1, X2 = np.column_stack((X1.ravel(), X2.ravel()))
X_grid = tree.predict(X_grid)
Z = Z.reshape(X1.shape)
Z =.4, levels=[0, .5, 1])
plt.contourf(X1, X2, Z, alpha# mglearn.discrete_scatter(X[:, 0], X[:, 1], y, alpha=0.1)
return
plt.figure()
plot_tree_partition(X, y, algo)'hour')
plt.xlabel('day')
plt.ylabel(True); plt.grid(
Principal Component Analysis
Übung 2: Prognose mit/ohne Feature Reduktion
Laden Sie die Datei ETW.xlsx
in ein pandas DataFrame.
- Erstellen Sie einen Scatter-Matrix-Plot der Daten.
- Stellen Sie das Target
AM
in Abhängigkeit des FeaturesMathematik
in einer Scatter-Matrix dar. Fitten Sie eine Gerade durch die Datenpunkte. - Berechnen Sie alle Korrelationen.
- Erstellen und bewerten Sie mehrere Modelle zur Prognose der Spalte
AM
aus den anderen Spalten. - Verwenden Sie PCA zur Prognose der Spalte
AM
aus den ersten zwei Hauptkomponenten. Stellen Sie die Ergebnisse grafisch dar.
Lösung:
= pd.read_excel("daten/ETW.xlsx", index_col=0)
df 3) df.head(
Interview | Mathematik | Physik | Chemie | AM | |
---|---|---|---|---|---|
0 | 0.860 | 0.391304 | 0.500000 | 0.6000 | 58.750 |
1 | 0.885 | 0.586957 | 0.095238 | 0.1250 | 57.600 |
2 | 0.750 | 0.739130 | 0.714286 | 0.3875 | 63.705 |
=pd.plotting.scatter_matrix(df,
sm=(15, 12), marker='.', diagonal='kde', density_kwds={'color':'black'},
figsize=30, alpha=.8,color='red',
s=plt.get_cmap('coolwarm'))
cmap
#y ticklabels
'size', 14) for item in sm.ravel()]
[plt.setp(item.yaxis.get_majorticklabels(), #x ticklabels
'size', 14,'rotation',0) for item in sm.ravel()]
[plt.setp(item.xaxis.get_majorticklabels(), #y labels
'size', 14, 'rotation',0,'ha','right') for item in sm.ravel()]
[plt.setp(item.yaxis.get_label(), #x labels
'size',14) for item in sm.ravel()]
[plt.setp(item.xaxis.get_label(),
plt.tight_layout()False) plt.grid(
/usr/lib/python3.10/site-packages/pandas/plotting/_matplotlib/misc.py:97: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
ax.scatter(
=np.polyfit(df.Mathematik*100,df.AM,1)
coefs=np.arange(0,100.1,1)
t=coefs[0]*t+coefs[1]
y
plt.figure()*100,df.AM,color='red')
plt.scatter(df.Mathematik=2,ls='solid',color='black')
plt.plot(t,y,lw'Mathematik Aufnahmetest (%)')
plt.xlabel('Angewandte Mathematik Prüfung (%)')
plt.ylabel(0,100)
plt.ylim(0,100.1,20))
plt.yticks(np.arange( plt.tight_layout()
df.corr()
Interview | Mathematik | Physik | Chemie | AM | |
---|---|---|---|---|---|
Interview | 1.000000 | 0.162706 | 0.314272 | 0.107939 | 0.271260 |
Mathematik | 0.162706 | 1.000000 | 0.360859 | 0.231434 | 0.552820 |
Physik | 0.314272 | 0.360859 | 1.000000 | 0.434040 | 0.336828 |
Chemie | 0.107939 | 0.231434 | 0.434040 | 1.000000 | 0.061733 |
AM | 0.271260 | 0.552820 | 0.336828 | 0.061733 | 1.000000 |
= ["Interview", "Mathematik", "Physik", "Chemie"]
features # features = ["Mathematik", "Physik"]
= df[features].values
X = df.AM.values y
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=None)
= train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test
= KNeighborsRegressor(n_neighbors=10)
algo = LinearRegression()
algo # algo = Ridge(alpha=.1)
# algo = Lasso(alpha=.25)
algo.fit(X_train, y_train)
print("train score: {:.2f}".format(algo.score(X_train, y_train)))
print("test score: {:.2f}".format(algo.score(X_test, y_test)))
train score: 0.35
test score: 0.31
algo.intercept_
31.491830606311968
algo.coef_
array([ 15.34341359, 39.10870341, 18.66945353, -14.64473302])
=(8,5))
plt.figure(figsize
plt.stem(np.concatenate( ([algo.intercept_], algo.coef_) ))'feature')
plt.xlabel(range(len(features) + 1), ['intercept'] + features)
plt.xticks('regression coefficient')
plt.ylabel(True) plt.grid(
from sklearn.decomposition import PCA
= PCA(n_components=2)
pca
pca.fit(X)
= pca.transform(X)
X_pca
print("Original shape: {}".format(str(X.shape)))
print("Reduced shape : {}".format(str(X_pca.shape)))
Original shape: (65, 4)
Reduced shape : (65, 2)
# plot fist vs second principal component, color by class:
=(7,7))
plt.figure(figsize0], X_pca[:, 1])
plt.scatter(X_pca[:, "First principal component")
plt.xlabel("Second principal component")
plt.ylabel(True) plt.grid(
='viridis')
plt.matshow(pca.components_, cmap0, 1], ["First component", "Second component"])
plt.yticks([
plt.colorbar()range(len(features)), features, rotation=60, ha='left')
plt.xticks("Feature")
plt.xlabel("Principal components");
plt.ylabel(
pca.components_
array([[ 0.10098801, 0.53178411, 0.55359898, 0.63287854],
[ 0.03665652, 0.77467872, -0.02995449, -0.63058061]])
= train_test_split(X_pca, y, random_state=0)
X_train, X_test, y_train, y_test
= KNeighborsRegressor(n_neighbors=10)
algo = LinearRegression()
algo #algo = Ridge(alpha=1)
#algo = Lasso(alpha=.5)
algo.fit(X_train, y_train)
print("train score: {:.2f}".format(algo.score(X_train, y_train)))
print("test score: {:.2f}".format(algo.score(X_test, y_test)))
#algo.coef_
train score: 0.34
test score: 0.28