Regression with Scikit-Learn

By Jose R. Zapata


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import datetime

Dataset information

automobile_df = pd.read_csv('auto-mpg.csv')

# View 5 random rows
automobile_df.sample(5)

mpg cylinders displacement horsepower weight acceleration model year origin car name
136 16.0 8 302.0 140 4141 14.0 74 1 ford gran torino
155 15.0 6 250.0 72 3158 19.5 75 1 ford maverick
156 16.0 8 400.0 170 4668 11.5 75 1 pontiac catalina
189 15.5 8 304.0 120 3962 13.9 76 1 amc matador
331 33.8 4 97.0 67 2145 18.0 80 3 subaru dl
# Dataset size
automobile_df.shape 
(398, 9)
automobile_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB

Data Preparation

# '?' marks missing values in this dataset; replace it with NaN and drop those rows
automobile_df = automobile_df.replace('?', np.nan)
automobile_df = automobile_df.dropna()
automobile_df.shape
(392, 9)

Remove unnecessary columns

automobile_df.drop(['origin', 'car name'], axis=1, inplace=True)
automobile_df.sample(5)

mpg cylinders displacement horsepower weight acceleration model year
359 28.1 4 141.0 80 3230 20.4 81
131 32.0 4 71.0 65 1836 21.0 74
257 19.4 6 232.0 90 3210 17.2 78
174 18.0 6 171.0 97 2984 14.5 75
188 16.0 8 318.0 150 4190 13.0 76
automobile_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    object 
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
dtypes: float64(3), int64(3), object(1)
memory usage: 24.5+ KB

Convert 'model year' to a four-digit year

automobile_df['model year'] = '19' + automobile_df['model year'].astype(str)
automobile_df.sample(5)

mpg cylinders displacement horsepower weight acceleration model year
144 31.0 4 76.0 52 1649 16.5 1974
231 15.5 8 400.0 190 4325 12.2 1977
154 15.0 6 250.0 72 3432 21.0 1975
169 20.0 6 232.0 100 2914 16.0 1975
116 16.0 8 400.0 230 4278 9.5 1973

Add a column with the age of the car

# Age relative to the current year, so the values depend on when this cell is run
automobile_df['age'] = datetime.datetime.now().year - pd.to_numeric(automobile_df['model year'])
automobile_df.drop(['model year'], axis=1, inplace=True)
automobile_df.sample(5)

mpg cylinders displacement horsepower weight acceleration age
18 27.0 4 97.0 88 2130 14.5 51
314 26.4 4 140.0 88 2870 18.1 41
380 36.0 4 120.0 88 2160 14.5 39
16 18.0 6 199.0 97 2774 15.5 51
276 21.6 4 121.0 115 2795 15.7 43
automobile_df.dtypes
mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
age               int64
dtype: object
# horsepower was read as an object column (it contained '?'), convert it to numeric
automobile_df['horsepower'] = pd.to_numeric(automobile_df['horsepower'], errors='coerce')
automobile_df.describe()

mpg cylinders displacement horsepower weight acceleration age
count 392.000000 392.000000 392.000000 392.000000 392.000000 392.000000 392.000000
mean 23.445918 5.471939 194.411990 104.469388 2977.584184 15.541327 45.020408
std 7.805007 1.705783 104.644004 38.491160 849.402560 2.758864 3.683737
min 9.000000 3.000000 68.000000 46.000000 1613.000000 8.000000 39.000000
25% 17.000000 4.000000 105.000000 75.000000 2225.250000 13.775000 42.000000
50% 22.750000 4.000000 151.000000 93.500000 2803.500000 15.500000 45.000000
75% 29.000000 8.000000 275.750000 126.000000 3614.750000 17.025000 48.000000
max 46.600000 8.000000 455.000000 230.000000 5140.000000 24.800000 51.000000

Univariate Analysis

Each variable should be analyzed individually and its characteristics described, e.g. as sketched below.
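
A minimal sketch (assuming the cleaned automobile_df from above): print the summary statistics and plot a histogram of every numeric column.

# Sketch: univariate summary of each numeric column
for col in automobile_df.select_dtypes(include='number').columns:
    print(f'--- {col} ---')
    print(automobile_df[col].describe())

# Histograms of all numeric columns in a single figure
automobile_df.hist(figsize=(12, 10), bins=20);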

Bivariate Analysis

Scatter Plots

fig, ax = plt.subplots(figsize=(12, 8))

plt.scatter(automobile_df['age'], automobile_df['mpg'])

plt.xlabel('Years')
plt.ylabel('Miles per gallon');


png

fig, ax = plt.subplots(figsize=(12, 8))

plt.scatter(automobile_df['acceleration'], automobile_df['mpg'])

plt.xlabel('Acceleration')
plt.ylabel('Miles per gallon');


png

fig, ax = plt.subplots(figsize=(12, 8))

plt.scatter(automobile_df['weight'], automobile_df['mpg'])

plt.xlabel('Weight')
plt.ylabel('Miles per gallon');


png

fig, ax = plt.subplots(figsize=(12, 8))

plt.scatter(automobile_df['displacement'], automobile_df['mpg'])

plt.xlabel('Displacement')
plt.ylabel('Miles per gallon');


png

fig, ax = plt.subplots(figsize=(12, 8))

plt.scatter(automobile_df['horsepower'], automobile_df['mpg'])

plt.xlabel('Horsepower')
plt.ylabel('Miles per gallon');


png

fig, ax = plt.subplots(figsize=(12, 8))

plt.scatter(automobile_df['cylinders'], automobile_df['mpg'])

plt.xlabel('Cylinders')
plt.ylabel('Miles per gallon');


png

Correlation

automobile_corr = automobile_df.corr()

automobile_corr

mpg cylinders displacement horsepower weight acceleration age
mpg 1.000000 -0.777618 -0.805127 -0.778427 -0.832244 0.423329 -0.580541
cylinders -0.777618 1.000000 0.950823 0.842983 0.897527 -0.504683 0.345647
displacement -0.805127 0.950823 1.000000 0.897257 0.932994 -0.543800 0.369855
horsepower -0.778427 0.842983 0.897257 1.000000 0.864538 -0.689196 0.416361
weight -0.832244 0.897527 0.932994 0.864538 1.000000 -0.416839 0.309120
acceleration 0.423329 -0.504683 -0.543800 -0.689196 -0.416839 1.000000 -0.290316
age -0.580541 0.345647 0.369855 0.416361 0.309120 -0.290316 1.000000
fig, ax = plt.subplots(figsize=(12, 10))

sns.heatmap(automobile_corr, annot=True);


png

automobile_df = automobile_df.sample(frac=1).reset_index(drop=True) # shuffle the rows

automobile_df.head()

mpg cylinders displacement horsepower weight acceleration age
0 16.5 6 168.0 120 3820 16.7 45
1 25.0 4 116.0 81 2220 16.9 45
2 16.0 8 318.0 150 4498 14.5 46
3 12.0 8 429.0 198 4952 11.5 48
4 31.3 4 120.0 75 2542 17.5 41
automobile_df.to_csv('auto-mpg-processed.csv', index=False)

Linear Regression

Linear regression with a single feature (horsepower)

from sklearn.model_selection import train_test_split

X = automobile_df[['horsepower']]
Y = automobile_df['mpg']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2) # 20% for testing
x_train.sample(5)

horsepower
23 70
348 153
357 72
366 100
74 97
from sklearn.linear_model import LinearRegression

# The normalize option was removed in scikit-learn 1.2; for ordinary least squares
# it does not change the predictions, so it is omitted here.
linear_model = LinearRegression().fit(x_train, y_train)
print('Training score: ', linear_model.score(x_train, y_train))
Training score:  0.6013261684165121
y_pred = linear_model.predict(x_test)
from sklearn.metrics import r2_score

print('Test score: ', r2_score(y_test, y_pred))
Test score:  0.6173981579692112
fig, ax = plt.subplots(figsize=(12, 8))

plt.scatter(x_test, y_test)
plt.plot(x_test, y_pred, color='r')

plt.xlabel('Horsepower')
plt.ylabel('Mpg')
plt.show()


png

Linear regression with a single feature - age

X = automobile_df[['age']]
Y = automobile_df['mpg']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

linear_model = LinearRegression().fit(x_train, y_train)

print('Training score: ', linear_model.score(x_train, y_train))

y_pred = linear_model.predict(x_test)

print('Test score: ', r2_score(y_test, y_pred))
Training score:  0.3151398209833455
Test score:  0.41035823800038795
fig, ax = plt.subplots(figsize=(12, 8))

plt.scatter(x_test, y_test)
plt.plot(x_test, y_pred, color='r')

plt.xlabel('Age')
plt.ylabel('Mpg')
plt.show()


png

Linear regression with multiple features

# X = automobile_df[['displacement', 'horsepower', 'weight', 'acceleration', 'cylinders']]

X = automobile_df[['displacement', 'horsepower', 'weight']]
Y = automobile_df['mpg']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
linear_model = LinearRegression().fit(x_train, y_train)
print('Training score: ', linear_model.score(x_train, y_train))
Training score:  0.6935339515300342
predictors = x_train.columns
coef = pd.Series(linear_model.coef_, predictors).sort_values()

print(coef)
horsepower     -0.044184
displacement   -0.010151
weight         -0.004635
dtype: float64
y_pred = linear_model.predict(x_test)
print('Test score', r2_score(y_test, y_pred))
Test score 0.7335785739591847
plt.figure(figsize = (20,10))

plt.plot(y_pred, label='Prediction')
plt.plot(y_test.values, label='Actual')

plt.ylabel('Mpg')

plt.legend()
plt.show()


png

Regression with Multiple Models

When there are many candidate models, running cross validation on all of them is computationally expensive and time consuming, so the weaker models should be discarded progressively until a final model is reached:

  • First, the data is split into one part for model selection (Model selection dataset) and another for the final performance test (Performance dataset; this part of the data must be used only at the very end of the whole process).
  • The first step is to split the Model selection dataset into a training part (train) and a test part (test), usually in an 80/20 or 70/30 proportion, as in the sketch after this list.
  • Evaluate all the models on the previous split and select the best ones (preferably models whose working principles differ from each other).
  • With the best models (how many depends on how close their results are), run cross-validation (to detect over-fitting) and keep the ones with the best results.
  • Take the best model or models (the ones with the best performance and low variance in their results) and run hyperparameter tuning. This process is computationally expensive, which is why it must be done with very few models.
  • Then select the best model (best performance and low variance) and keep the hyperparameters that gave the best result.
  • Finally, train the selected model with the chosen hyperparameters on the Model selection dataset and run the final test on the Performance dataset.
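
As a sketch of the first two steps (the dataframe split below is illustrative, not part of the original notebook):

# Hold out a performance dataset that is touched only at the very end
from sklearn.model_selection import train_test_split

model_selection_df, performance_df = train_test_split(automobile_df, test_size=0.2)

# Then split the model selection data into train (80%) and test (20%)
x_train, x_test, y_train, y_test = train_test_split(model_selection_df.drop('mpg', axis=1),
                                                    model_selection_df['mpg'],
                                                    test_size=0.2)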
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lars
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

import warnings
warnings.filterwarnings("ignore")
automobile_df = pd.read_csv('auto-mpg-processed.csv')

automobile_df.head()

mpg cylinders displacement horsepower weight acceleration age
0 16.5 6 168.0 120 3820 16.7 45
1 25.0 4 116.0 81 2220 16.9 45
2 16.0 8 318.0 150 4498 14.5 46
3 12.0 8 429.0 198 4952 11.5 48
4 31.3 4 120.0 75 2542 17.5 41
result_dict = {}

Helper functions

def build_model(regression_fn,                
                name_of_y_col, 
                names_of_x_cols, 
                dataset, 
                test_frac=0.2,
                preprocess_fn=None):
    
    """build_model
    
    Train a model with the given regression function and report its
    train/test R2 scores.
        
    """
    
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    if preprocess_fn is not None:
        X = preprocess_fn(X)

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_frac)
    
    model = regression_fn(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    print("Training_score : " , model.score(x_train, y_train))
    print("Test_score : ", r2_score(y_test, y_pred))

    return {
            'Training_score': model.score(x_train, y_train),
            'Test_score': r2_score(y_test, y_pred)
           }     
def compare_results():
    for key in result_dict:
        print('Regression: ', key)
        print('Training score', result_dict[key]['Training_score'])
        print('Test score', result_dict[key]['Test_score'])
        print()

Linear regression

def linear_reg(x_train, y_train):
    # normalize was removed in scikit-learn 1.2, so plain OLS is used here
    model = LinearRegression()
    model.fit(x_train, y_train)
    
    return model
result_dict['mpg ~ single_linear'] = build_model(linear_reg,
                                                 'mpg',
                                                ['weight'],
                                                  automobile_df)
Training_score :  0.6696115274238652
Test_score :  0.7704980985444312
result_dict['mpg ~ kitchen_sink_linear'] = build_model(linear_reg,
                                                      'mpg',
                                                     ['cylinders',
                                                      'displacement',
                                                      'horsepower',
                                                      'weight',
                                                      'acceleration'],
                                                      automobile_df)
Training_score :  0.7074657620419418
Test_score :  0.6989622365211952
result_dict['mpg ~ parsimonious_linear'] = build_model(linear_reg,
                                                      'mpg',
                                                    ['horsepower',
                                                     'weight'],
                                                      automobile_df)
Training_score :  0.7013230352668707
Test_score :  0.7232115758741458
compare_results()
Regression:  mpg ~ single_linear
Training score 0.6696115274238652
Test score 0.7704980985444312

Regression:  mpg ~ kitchen_sink_linear
Training score 0.7074657620419418
Test score 0.6989622365211952

Regression:  mpg ~ parsimonious_linear
Training score 0.7013230352668707
Test score 0.7232115758741458

Lasso

def lasso_reg(x_train, y_train, alpha=0.5):
    model = Lasso(alpha=alpha)
    model.fit(x_train, y_train)
    
    return model
result_dict['mpg ~ kitchen_sink_lasso'] = build_model(lasso_reg,
                                                     'mpg',
                                                    ['cylinders',
                                                     'displacement',
                                                     'horsepower',
                                                     'weight',
                                                     'acceleration'],
                                                      automobile_df)
Training_score :  0.6943200981996365
Test_score :  0.7366916237596017

Ridge

def ridge_reg(x_train, y_train, alpha=0.5):
    # The normalize option was removed in scikit-learn 1.2; to reproduce the old
    # behaviour, standardize the features explicitly (e.g. with StandardScaler).
    model = Ridge(alpha=alpha)
    model.fit(x_train, y_train)
    
    return model
result_dict['mpg ~ kitchen_sink_ridge'] = build_model(ridge_reg,
                                                      'mpg',
                                                     ['cylinders',
                                                      'displacement',
                                                      'horsepower',
                                                      'weight',
                                                      'acceleration'],
                                                       automobile_df)
Training_score :  0.6890878092067418
Test_score :  0.6560571888322049

ElasticNet

def elastic_net_reg(x_train, y_train, alpha=1, l1_ratio=0.5,
                    max_iter=100000, warm_start=True, equivalent_to="Elastic Net"):

    # In ElasticNet, l1_ratio=1 is a pure L1 penalty (Lasso) and l1_ratio=0 is a
    # pure L2 penalty (Ridge); normalize was removed in scikit-learn 1.2.
    print("Equivalent to:", equivalent_to)
    
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio,
                       max_iter=max_iter, warm_start=warm_start)
    model.fit(x_train, y_train)

    return model
from functools import partial

## This generates a warning which says it will not converge
result_dict['mpg ~ kitchen_sink_elastic_net_ols'] = build_model(partial(elastic_net_reg, 
                                                                        alpha=0, equivalent_to="OLS"),
                                                               'mpg',
                                                              ['cylinders',
                                                               'displacement',
                                                               'horsepower',
                                                               'weight',
                                                               'acceleration'],
                                                                automobile_df)
Equivalent to: OLS
Training_score :  0.7286154402790895
Test_score :  0.6261044691205849
result_dict['mpg ~ kitchen_sink_elastic_net_ridge'] = build_model(partial(elastic_net_reg, alpha=1, 
                                                                    l1_ratio=0, equivalent_to="Ridge"),
                                                                  'mpg',
                                                                 ['cylinders',
                                                                  'displacement',
                                                                  'horsepower',
                                                                  'weight',
                                                                  'acceleration'],
                                                                   automobile_df)
Equivalent to: Ridge
Training_score :  0.7065392087832174
Test_score :  0.697481690838238
result_dict['mpg ~ kitchen_sink_elastic_net_lasso'] = build_model(partial(elastic_net_reg, alpha=1, 
                                                                    l1_ratio=1, equivalent_to="Lasso"),
                                                                  'mpg',
                                                                 ['cylinders',
                                                                  'displacement',
                                                                  'horsepower',
                                                                  'weight',
                                                                  'acceleration'],
                                                                   automobile_df)
Equivalent to: Lasso
Training_score :  0.7197200945905666
Test_score :  0.6454151082422006
result_dict['mpg ~ kitchen_sink_elastic_net'] = build_model(partial(elastic_net_reg, alpha=1, l1_ratio=0.5),
                                                            'mpg',
                                                          [ 'cylinders',
                                                            'displacement',
                                                            'horsepower',
                                                            'weight',
                                                            'acceleration'],
                                                            automobile_df)
Equivalent to: Elastic Net
Training_score :  0.7137100772862914
Test_score :  0.6575033081647761

SVR

For SVR on larger datasets this alternative implementation is preferred (a short sketch follows the list below):

https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html#sklearn.svm.LinearSVR

  • Uses a different library for implementation
  • More flexibility with choice of penalties
  • Scales to larger datasets
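
A minimal sketch of swapping in LinearSVR (the epsilon and C values below are illustrative; LinearSVR is scale-sensitive, so it is paired with a scaler):

from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features, then fit the linear support vector regressor
linear_svr_model = make_pipeline(StandardScaler(),
                                 LinearSVR(epsilon=0.05, C=0.3, max_iter=10000))
# linear_svr_model.fit(x_train, y_train) would train it like svr_reg below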
def svr_reg(x_train, y_train, kernel='linear', epsilon=0.05, C=0.3):
    model = SVR(kernel=kernel, epsilon=epsilon, C=C)
    model.fit(x_train,y_train)
    
    return model
result_dict['mpg ~ kitchen_sink_svr'] = build_model(svr_reg,
                                                    'mpg',
                                                   ['cylinders',
                                                    'displacement',
                                                    'horsepower',
                                                    'weight',
                                                    'acceleration'],
                                                     automobile_df)
Training_score :  0.6948101242037901
Test_score :  0.7240248634465822

KNR

def kneighbors_reg(x_train, y_train, n_neighbors=10):
    model = KNeighborsRegressor(n_neighbors=n_neighbors)
    model.fit(x_train, y_train)
    
    return model
result_dict['mpg ~ kitchen_sink_kneighbors'] = build_model(kneighbors_reg,
                                                           'mpg',
                                                          ['cylinders',
                                                           'displacement',
                                                           'horsepower',
                                                           'weight',
                                                           'acceleration'],
                                                           automobile_df)
Training_score :  0.7638870632761847
Test_score :  0.6855412306938089

SGD

def apply_standard_scaler(x):
    # Fit the scaler on x and return the standardized features
    scaler = StandardScaler()

    return scaler.fit_transform(x)
def sgd_reg(x_train, y_train, max_iter=10000, tol=1e-3):
    model = SGDRegressor(max_iter=max_iter, tol=tol)
    model.fit(x_train, y_train)
    
    return model
result_dict['mpg ~ kitchen_sink_sgd'] = build_model(sgd_reg,
                                                   'mpg',
                                                  ['cylinders',
                                                   'displacement',
                                                   'horsepower',
                                                   'weight',
                                                   'acceleration'],
                                                    automobile_df,
                                                    preprocess_fn=apply_standard_scaler)
Training_score :  0.7210212471051309
Test_score :  0.6388005372824846

Decision Tree

def decision_tree_reg(x_train, y_train, max_depth=2):
    model = DecisionTreeRegressor(max_depth=max_depth)
    model.fit(x_train, y_train)
    
    return model
result_dict['mpg ~ kitchen_sink_decision_tree'] = build_model(decision_tree_reg,
                                                             'mpg',
                                                            ['cylinders',
                                                             'displacement',
                                                             'horsepower',
                                                             'weight',
                                                             'acceleration'],
                                                              automobile_df)
Training_score :  0.7302512814978837
Test_score :  0.6695729153590095

Lars

def lars_reg(x_train, y_train, n_nonzero_coefs=4):
    model = Lars(n_nonzero_coefs=n_nonzero_coefs)
    model.fit(x_train, y_train)
    
    return model
result_dict['mpg ~ kitchen_sink_lars'] = build_model(lars_reg,
                                                    'mpg',
                                                   ['cylinders',
                                                    'displacement',
                                                    'horsepower',
                                                    'weight',
                                                    'acceleration'],
                                                     automobile_df)
Training_score :  0.7221519955627607
Test_score :  0.6442421991887889
compare_results()
Regression:  mpg ~ single_linear
Training score 0.6696115274238652
Test score 0.7704980985444312

Regression:  mpg ~ kitchen_sink_linear
Training score 0.7074657620419418
Test score 0.6989622365211952

Regression:  mpg ~ parsimonious_linear
Training score 0.7013230352668707
Test score 0.7232115758741458

Regression:  mpg ~ kitchen_sink_lasso
Training score 0.6943200981996365
Test score 0.7366916237596017

Regression:  mpg ~ kitchen_sink_ridge
Training score 0.6890878092067418
Test score 0.6560571888322049

Regression:  mpg ~ kitchen_sink_elastic_net_ols
Training score 0.7286154402790895
Test score 0.6261044691205849

Regression:  mpg ~ kitchen_sink_elastic_net_ridge
Training score 0.7065392087832174
Test score 0.697481690838238

Regression:  mpg ~ kitchen_sink_elastic_net_lasso
Training score 0.7197200945905666
Test score 0.6454151082422006

Regression:  mpg ~ kitchen_sink_elastic_net
Training score 0.7137100772862914
Test score 0.6575033081647761

Regression:  mpg ~ kitchen_sink_svr
Training score 0.6948101242037901
Test score 0.7240248634465822

Regression:  mpg ~ kitchen_sink_kneighbors
Training score 0.7638870632761847
Test score 0.6855412306938089

Regression:  mpg ~ kitchen_sink_sgd
Training score 0.7210212471051309
Test score 0.6388005372824846

Regression:  mpg ~ kitchen_sink_decision_tree
Training score 0.7302512814978837
Test score 0.6695729153590095

Regression:  mpg ~ kitchen_sink_lars
Training score 0.7221519955627607
Test score 0.6442421991887889
# Build a dictionary with only the test result of each model
nombre_modelos = result_dict.keys()
resultados_prueba = {} # create an empty dictionary
for nombre in nombre_modelos:
    resultados_prueba[nombre] = result_dict[nombre]['Test_score']
plt.figure(figsize = (12,10)) # figure size
plt.barh(range(len(resultados_prueba)), list(resultados_prueba.values()), 
        align='center');
plt.title("Test-set score of each model")
plt.yticks(range(len(resultados_prueba)), list(resultados_prueba.keys()));


png

Cross Validation - Model Selection

Analyze the variance of the results in order to keep the models with the best performance.

# List to store the models selected for cross validation
models = []

# Store the models as (name, model) tuples
models.append(('kitchen_sink_linear',LinearRegression()))
models.append(('kitchen_sink_lasso',Lasso(alpha=0.5)))
models.append(('kitchen_sink_elastic_net',ElasticNet(alpha=1, l1_ratio=0.5,
                                                    max_iter= 100000, 
                                                    warm_start= True)))
models.append(('kitchen_sink_kneighbors',KNeighborsRegressor(n_neighbors=10)))
models.append(('kitchen_sink_decision_tree',DecisionTreeRegressor(max_depth=2)))
models.append(('kitchen_sink_svr',SVR(kernel='linear', epsilon=0.05, C=0.3)))
# Record the results of each model
from sklearn import model_selection

# Seed so the test results are reproducible
seed = 2
results = []
names = []
scoring = 'r2'

# Build the train split explicitly with the kitchen-sink features so this cell
# does not depend on whichever x_train was defined last.
X = automobile_df[['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration']]
Y = automobile_df['mpg']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

for name, model in models:
    # K-fold cross validation for model selection
    kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print(f"{name}: {cv_results.mean()}, {cv_results.std()}")
kitchen_sink_linear: 0.6844234904934009, 0.03159374424744247
kitchen_sink_lasso: 0.6844633664989738, 0.03171821519792585
kitchen_sink_elastic_net: 0.6844703085848355, 0.03172143884528286
kitchen_sink_kneighbors: 0.6608036231338688, 0.040735571400253065
kitchen_sink_decision_tree: 0.6570680524867771, 0.03375219816362394
kitchen_sink_svr: 0.6702154382576372, 0.03214642285149907
plt.figure(figsize = (15,8)) 
result_df = pd.DataFrame(results, index=names).T
result_df.boxplot()
plt.title("Resultados de Cross Validation");


png

Hyperparameter Tuning

Hyperparameter optimization: the best models, preferably ones whose working principles differ from each other, are selected.

from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")
automobile_df = pd.read_csv('auto-mpg-processed.csv')

automobile_df.head()

mpg cylinders displacement horsepower weight acceleration age
0 16.5 6 168.0 120 3820 16.7 45
1 25.0 4 116.0 81 2220 16.9 45
2 16.0 8 318.0 150 4498 14.5 46
3 12.0 8 429.0 198 4952 11.5 48
4 31.3 4 120.0 75 2542 17.5 41
X = automobile_df.drop(['mpg', 'age'], axis=1)

Y = automobile_df['mpg']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

Lasso regression

parameters = {'alpha': [0.2, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0]}

grid_search = GridSearchCV(Lasso(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)
GridSearchCV(cv=3, estimator=Lasso(),
             param_grid={'alpha': [0.2, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0]},
             return_train_score=True)

Hyperparameter tuning results

print(f"Mejor resultado = {grid_search.best_score_}")
print(f"Mejor parametros = {grid_search.best_params_}")
Mejor resultado = 0.7021717010510476
Mejor parametros = {'alpha': 1.0}
# To view all of the cross validation results
# Not required; it is only informative, to see how the model varies
for i in range(len(parameters['alpha'])):
    print('Parameters: ', grid_search.cv_results_['params'][i])

    print('Mean test score: ', grid_search.cv_results_['mean_test_score'][i])
    
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])
Parameters:  {'alpha': 0.2}
Mean test score:  0.6969019568609475
Rank:  7
Parameters:  {'alpha': 0.4}
Mean test score:  0.698541427433439
Rank:  6
Parameters:  {'alpha': 0.6}
Mean test score:  0.699965209997786
Rank:  5
Parameters:  {'alpha': 0.7}
Mean test score:  0.7005967051825271
Rank:  4
Parameters:  {'alpha': 0.8}
Mean test score:  0.7011748241730046
Rank:  3
Parameters:  {'alpha': 0.9}
Mean test score:  0.7016999274085981
Rank:  2
Parameters:  {'alpha': 1.0}
Mean test score:  0.7021717010510476
Rank:  1
lasso_model = Lasso(alpha=grid_search.best_params_['alpha']).fit(x_train, y_train)
y_pred = lasso_model.predict(x_test)

print('Training score: ', lasso_model.score(x_train, y_train))
print('Test score: ', r2_score(y_test, y_pred))
Training score:  0.7182879921908456
Test score:  0.6653323457545108

KNeighbors regression

parameters = {'n_neighbors': [10, 12, 14, 18, 20, 25, 30, 35, 50]}

grid_search = GridSearchCV(KNeighborsRegressor(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)
GridSearchCV(cv=3, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [10, 12, 14, 18, 20, 25, 30, 35, 50]},
             return_train_score=True)
print(f"Mejor resultado = {grid_search.best_score_}")
print(f"Mejor parametros = {grid_search.best_params_}")
Mejor resultado = 0.7124428667389037
Mejor parametros = {'n_neighbors': 25}
kneighbors_model = KNeighborsRegressor(n_neighbors=grid_search.best_params_['n_neighbors']).fit(x_train, y_train)
y_pred = kneighbors_model.predict(x_test)

print('Training score: ', kneighbors_model.score(x_train, y_train))
print('Test score: ', r2_score(y_test, y_pred))
Training score:  0.7425582447137113
Test score:  0.657177469793524

Decision Tree

parameters = {'max_depth':[1, 2, 3, 4, 5, 7, 8]}

grid_search = GridSearchCV(DecisionTreeRegressor(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [1, 2, 3, 4, 5, 7, 8]},
             return_train_score=True)
print(f"Mejor resultado = {grid_search.best_score_}")
print(f"Mejor parametros = {grid_search.best_params_}")
Mejor resultado = 0.6827011911186145
Mejor parametros = {'max_depth': 2}
decision_tree_model = DecisionTreeRegressor(max_depth=grid_search.best_params_['max_depth']).fit(x_train, y_train)
y_pred = decision_tree_model.predict(x_test)

print('Training score: ', decision_tree_model.score(x_train, y_train))
print('Test score: ', r2_score(y_test, y_pred))
Training score:  0.7447696129290833
Test score:  0.6806583570617913

SVR

parameters = {'epsilon': [0.05, 0.1, 0.2, 0.3],
              'C': [0.2, 0.3]}

grid_search = GridSearchCV(SVR(kernel='linear'), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

GridSearchCV(cv=3, estimator=SVR(kernel='linear'),
             param_grid={'C': [0.2, 0.3], 'epsilon': [0.05, 0.1, 0.2, 0.3]},
             return_train_score=True)
print(f"Mejor resultado = {grid_search.best_score_}")
print(f"Mejor parametros = {grid_search.best_params_}")
Mejor resultado = 0.6899470090629146
Mejor parametros = {'C': 0.2, 'epsilon': 0.3}

As an academic exercise, the result is also estimated on the test data

svr_model = SVR(kernel='linear',
                epsilon=grid_search.best_params_['epsilon'], 
                C=grid_search.best_params_['C']).fit(x_train, y_train)
y_pred = svr_model.predict(x_test)

print('Training score: ', svr_model.score(x_train, y_train))
print('Test score: ', r2_score(y_test, y_pred))
Training score:  0.7106827209217679
Test score:  0.6134432209981653

Save the Model

Select the model that obtained the best results in hyperparameter tuning and also performed well on the test dataset.

# Train the model with all the available data
# (n_neighbors=10 is kept here as in the original run; the grid search above suggested 25)
kneighbors_model = KNeighborsRegressor(n_neighbors=10).fit(X, Y)
from joblib import dump # serialization library

# save the model to a file
dump(kneighbors_model, 'kneighbors_model-auto_mpg.joblib')
['kneighbors_model-auto_mpg.joblib']

Use the Model

import pandas as pd
from joblib import load 
modelo = load('kneighbors_model-auto_mpg.joblib')
modelo
KNeighborsRegressor(n_neighbors=10)
datos = pd.read_csv('auto-mpg-processed.csv')
datos.head()

mpg cylinders displacement horsepower weight acceleration age
0 16.5 6 168.0 120 3820 16.7 45
1 25.0 4 116.0 81 2220 16.9 45
2 16.0 8 318.0 150 4498 14.5 46
3 12.0 8 429.0 198 4952 11.5 48
4 31.3 4 120.0 75 2542 17.5 41
# take two input rows to make a prediction
datos_prueba = datos.iloc[2:4,1:6]
datos_prueba

cylinders displacement horsepower weight acceleration
2 8 318.0 150 4498 14.5
3 8 429.0 198 4952 11.5
# prediction results from the model
modelo.predict(datos_prueba)
array([13.85, 12.  ])

PhD. Jose R. Zapata