Clasificacion con Scikit Learn

Por Jose R. Zapata

Invitame a un Cafe

Importar librerias

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Informacion de los Datos

Titanic dataset

Fuente: https://www.kaggle.com/francksylla/titanic-machine-learning-from-disaster

titanic_df = pd.read_csv("https://www.openml.org/data/get_csv/16826755/phpMYEkMl")
titanic_df.sample(10)

pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
914 3 0 Karlsson, Mr. Julius Konrad Eugen male 33 0 0 347465 7.8542 ? S ? ? ?
667 3 0 Barry, Miss. Julia female 27 0 0 330844 7.8792 ? Q ? ? New York, NY
395 2 1 Doling, Miss. Elsie female 18 0 1 231919 23 ? S ? ? Southampton
827 3 0 Goodwin, Master. William Frederick male 11 5 2 CA 2144 46.9 ? S ? ? Wiltshire, England Niagara Falls, NY
1033 3 1 Moss, Mr. Albert Johan male ? 0 0 312991 7.775 ? S B ? ?
780 3 1 Drapkin, Miss. Jennie female 23 0 0 SOTON/OQ 392083 8.05 ? S ? ? London New York, NY
254 1 1 Saalfeld, Mr. Adolphe male ? 0 0 19988 30.5 C106 S 3 ? Manchester, England
274 1 1 Spedden, Mr. Frederic Oakley male 45 1 1 16966 134.5 E34 C 3 ? Tuxedo Park, NY
94 1 1 Dodge, Master. Washington male 4 0 2 33638 81.8583 A34 S 5 ? San Francisco, CA
121 1 1 Frauenthal, Mrs. Henry William (Clara Heinshei... female ? 1 0 PC 17611 133.65 ? S 5 ? New York, NY
titanic_df.shape
(1309, 14)
titanic_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB

Preparacion de datos

Eliminar columnas no necesarias

titanic_df = titanic_df.drop(['boat', 'body', 'home.dest','name', 'ticket', 'cabin'], axis='columns')

titanic_df.head()

pclass survived sex age sibsp parch fare embarked
0 1 1 female 29 0 0 211.3375 S
1 1 1 male 0.9167 1 2 151.55 S
2 1 0 female 2 1 2 151.55 S
3 1 0 male 30 1 2 151.55 S
4 1 0 female 25 1 2 151.55 S
titanic_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   pclass    1309 non-null   int64 
 1   survived  1309 non-null   int64 
 2   sex       1309 non-null   object
 3   age       1309 non-null   object
 4   sibsp     1309 non-null   int64 
 5   parch     1309 non-null   int64 
 6   fare      1309 non-null   object
 7   embarked  1309 non-null   object
dtypes: int64(4), object(4)
memory usage: 81.9+ KB

Tratamiento de datos nulos

# Contar el numero de datos nulos
titanic_df[titanic_df.isnull().any(axis='columns')].count()
pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64
titanic_df['age'].unique()
array(['29', '0.9167', '2', '30', '25', '48', '63', '39', '53', '71',
       '47', '18', '24', '26', '80', '?', '50', '32', '36', '37', '42',
       '19', '35', '28', '45', '40', '58', '22', '41', '44', '59', '60',
       '33', '17', '11', '14', '49', '76', '46', '27', '64', '55', '70',
       '38', '51', '31', '4', '54', '23', '43', '52', '16', '32.5', '21',
       '15', '65', '28.5', '45.5', '56', '13', '61', '34', '6', '57',
       '62', '67', '1', '12', '20', '0.8333', '8', '0.6667', '7', '3',
       '36.5', '18.5', '5', '66', '9', '0.75', '70.5', '22.5', '0.3333',
       '0.1667', '40.5', '10', '23.5', '34.5', '20.5', '30.5', '55.5',
       '38.5', '14.5', '24.5', '60.5', '74', '0.4167', '11.5', '26.5'],
      dtype=object)
titanic_df = titanic_df.replace('?',np.nan)
titanic_df[titanic_df.isnull().any(axis='columns')].count()
pclass      266
survived    266
sex         266
age           3
sibsp       266
parch       266
fare        265
embarked    264
dtype: int64

Los datos faltantes se deben procesar en base al analisis de cada variable, en esta excepcion solo vamos a probar las tecnicas de machine learning en clasificacion, entonces vamos a eliminar las filas con datos faltantes para demostrar el uso de scikit-learn, pero el tratamiento de los datos faltantes es importante y en un caso real no se debe saltar.


# eliminar las filas con datos nulos
titanic_df = titanic_df.dropna()
titanic_df.shape
(1043, 8)
titanic_df[titanic_df.isnull().any(axis='columns')].count()
pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

Descripcion estadistica

titanic_df.describe()

pclass survived sibsp parch
count 1043.000000 1043.000000 1043.000000 1043.000000
mean 2.209012 0.407478 0.504314 0.421860
std 0.840685 0.491601 0.913080 0.840655
min 1.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 0.000000 0.000000
50% 2.000000 0.000000 0.000000 0.000000
75% 3.000000 1.000000 1.000000 1.000000
max 3.000000 1.000000 8.000000 6.000000
titanic_df.head()

pclass survived sex age sibsp parch fare embarked
0 1 1 female 29 0 0 211.3375 S
1 1 1 male 0.9167 1 2 151.55 S
2 1 0 female 2 1 2 151.55 S
3 1 0 male 30 1 2 151.55 S
4 1 0 female 25 1 2 151.55 S

Analisis Univariable

Se debe hacer un analisis de cada una de las variables y describir sus caracteristicas

Analisis Bivariable

Scatter Plots

fig, ax = plt.subplots(figsize=(12, 8))

plt.scatter(titanic_df['age'], titanic_df['survived'])

plt.xlabel('age')
plt.ylabel('survived');

png

fig, ax = plt.subplots(figsize=(12, 8))

plt.scatter(titanic_df['fare'], titanic_df['survived'])

plt.xlabel('fare')
plt.ylabel('survived');

png

Correlacion

pd.crosstab(titanic_df['sex'], titanic_df['survived'])

survived 0 1
sex
female 96 290
male 522 135
pd.crosstab(titanic_df['pclass'], titanic_df['survived'])

survived 0 1
pclass
1 103 179
2 146 115
3 369 131
titanic_data_corr = titanic_df.corr()

titanic_data_corr
/var/folders/td/0mp8bn7513d4z8y4hkm49zqw3y854m/T/ipykernel_52406/1701824440.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  titanic_data_corr = titanic_df.corr()

pclass survived sibsp parch
pclass 1.000000 -0.317737 0.046333 0.016342
survived -0.317737 1.000000 -0.011403 0.115436
sibsp 0.046333 -0.011403 1.000000 0.373960
parch 0.016342 0.115436 0.373960 1.000000
fig, ax = plt.subplots(figsize=(12, 10))

sns.heatmap(titanic_data_corr, annot=True);

png

Transformacion de Variables

from sklearn import preprocessing

label_encoding = preprocessing.LabelEncoder()
titanic_df['sex'] = label_encoding.fit_transform(titanic_df['sex'].astype(str))

titanic_df.head()

pclass survived sex age sibsp parch fare embarked
0 1 1 0 29 0 0 211.3375 S
1 1 1 1 0.9167 1 2 151.55 S
2 1 0 0 2 1 2 151.55 S
3 1 0 1 30 1 2 151.55 S
4 1 0 0 25 1 2 151.55 S
label_encoding.classes_
array(['female', 'male'], dtype=object)

C = Cherbourg, Q = Queenstown, S = Southampton

titanic_df = pd.get_dummies(titanic_df, columns=['embarked'])

titanic_df.head()

pclass survived sex age sibsp parch fare embarked_C embarked_Q embarked_S
0 1 1 0 29 0 0 211.3375 0 0 1
1 1 1 1 0.9167 1 2 151.55 0 0 1
2 1 0 0 2 1 2 151.55 0 0 1
3 1 0 1 30 1 2 151.55 0 0 1
4 1 0 0 25 1 2 151.55 0 0 1
titanic_df = titanic_df.sample(frac=1).reset_index(drop=True)

titanic_df.head()

pclass survived sex age sibsp parch fare embarked_C embarked_Q embarked_S
0 3 0 0 23 0 0 7.925 0 0 1
1 2 0 1 40 1 0 26 0 0 1
2 3 0 0 32 1 1 15.5 0 1 0
3 1 1 0 64 0 2 83.1583 1 0 0
4 3 0 1 20 0 0 7.8542 0 0 1
titanic_df.to_csv('titanic_processed.csv', index=False)

Clasificacion Binaria

titanic_df = pd.read_csv('titanic_processed.csv')

titanic_df.head()

pclass survived sex age sibsp parch fare embarked_C embarked_Q embarked_S
0 3 0 0 23.0 0 0 7.9250 0 0 1
1 2 0 1 40.0 1 0 26.0000 0 0 1
2 3 0 0 32.0 1 1 15.5000 0 1 0
3 1 1 0 64.0 0 2 83.1583 1 0 0
4 3 0 1 20.0 0 0 7.8542 0 0 1
titanic_df.shape
(1043, 10)
from sklearn.model_selection import train_test_split

X_features = titanic_df.drop('survived', axis='columns')
Y_target = titanic_df['survived']

x_train, x_test, y_train, y_test = train_test_split(X_features,
                                                    Y_target,
                                                    test_size=0.2,
                                                    stratify=Y_target)
x_train.shape, y_train.shape
((834, 9), (834,))
x_test.shape, y_test.shape
((209, 9), (209,))

Regresion Logistica para Clasificacion

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear').fit(x_train, y_train)
y_pred = logistic_model.predict(x_test)

Matriz de confusion

pred_results = pd.DataFrame({'y_test': y_test,
                             'y_pred': y_pred})
pred_results.head()

y_test y_pred
796 1 1
523 0 1
707 1 0
716 1 1
3 1 1
titanic_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

titanic_crosstab

y_test 0 1
y_pred
0 106 19
1 18 66

Precision - recall

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)
accuracy_score :  0.8229665071770335
precision_score :  0.7857142857142857
recall_score :  0.7764705882352941
titanic_crosstab

y_test 0 1
y_pred
0 106 19
1 18 66
TP = titanic_crosstab[1][1]
TN = titanic_crosstab[0][0]
FP = titanic_crosstab[0][1]
FN = titanic_crosstab[1][0]
accuracy_score_verified = (TP + TN) / (TP + FP + TN + FN)

accuracy_score_verified
0.8229665071770335
precision_score_survived = TP / (TP + FP)

precision_score_survived
0.7857142857142857
recall_score_survived = TP / (TP + FN)

recall_score_survived
0.7764705882352941

Clasificacion con Multiples Modelos

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
titanic_df = pd.read_csv('titanic_processed.csv')

titanic_df.sample(5)

pclass survived sex age sibsp parch fare embarked_C embarked_Q embarked_S
243 2 0 1 25.0 1 0 26.0000 0 0 1
795 3 0 1 44.0 0 0 8.0500 0 0 1
114 1 0 1 36.0 0 0 75.2417 1 0 0
155 3 0 0 16.0 5 2 46.9000 0 0 1
361 3 0 1 70.5 0 0 7.7500 0 1 0
FEATURES = list(titanic_df.columns[1:])

FEATURES
['survived',
 'sex',
 'age',
 'sibsp',
 'parch',
 'fare',
 'embarked_C',
 'embarked_Q',
 'embarked_S']
result_dict = {}

Funciones de ayuda

def summarize_classification(y_test, y_pred):
    
    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)

    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    return {'accuracy': acc, 
            'precision': prec,
            'recall':recall, 
            'accuracy_count':num_acc}
def build_model(classifier_fn,                
                name_of_y_col, 
                names_of_x_cols, 
                dataset, 
                test_frac=0.2):
    
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_frac)
       
    model = classifier_fn.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)

    y_pred_train = model.predict(x_train)
    
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)
    
    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred})
    
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    
    return {'training': train_summary, 
            'test': test_summary,
            'confusion_matrix': model_crosstab}
# funcion para comparar resultados
def compare_results():
    for key in result_dict:
        print('Classification: ', key)

        print()
        print('Training data')
        for score in result_dict[key]['training']:
            print(score, result_dict[key]['training'][score])

        print()
        print('Test data')
        for score in result_dict[key]['test']:
            print(score, result_dict[key]['test'][score])
       
        print()

Regresion logistica

result_dict['survived ~ logistic'] = build_model(LogisticRegression(solver='liblinear'),
                                              'survived',
                                               FEATURES,
                                               titanic_df)

compare_results()
Classification:  survived ~ logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Lineal Discriminant Analysis

result_dict['survived ~ linear_discriminant_analysis'] = build_model(LinearDiscriminantAnalysis(solver='svd'),
                                                                 'survived',
                                                                  FEATURES,
                                                                  titanic_df)
compare_results()
Classification:  survived ~ logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7805755395683454
precision 0.7523809523809524
recall 0.6929824561403509
accuracy_count 651

Test data
accuracy 0.7799043062200957
precision 0.7466666666666667
recall 0.6746987951807228
accuracy_count 163
result_dict['survived ~ linear_discriminant_analysis'] = build_model(LinearDiscriminantAnalysis(solver='svd'),
                                                                     'survived',
                                                                      FEATURES[0:-1],
                                                                      titanic_df)
compare_results()
Classification:  survived ~ logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7889688249400479
precision 0.7701863354037267
recall 0.7085714285714285
accuracy_count 658

Test data
accuracy 0.7464114832535885
precision 0.6571428571428571
recall 0.6133333333333333
accuracy_count 156

Quadratic Discriminant Analysis

def quadratic_discriminant_fn(x_train, y_train):
    
    model = QuadraticDiscriminantAnalysis()
    model.fit(x_train, y_train)
    
    return model
result_dict['survived ~ quadratic_discriminant_analysis'] = build_model(QuadraticDiscriminantAnalysis(),
                                                                        'survived',
                                                                        FEATURES[0:-1],
                                                                        titanic_df)

compare_results()
Classification:  survived ~ logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7889688249400479
precision 0.7701863354037267
recall 0.7085714285714285
accuracy_count 658

Test data
accuracy 0.7464114832535885
precision 0.6571428571428571
recall 0.6133333333333333
accuracy_count 156

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.5971223021582733
precision 0.0
recall 0.0
accuracy_count 498

Test data
accuracy 0.5741626794258373
precision 0.0
recall 0.0
accuracy_count 120



/Users/jzapata/Sites/JoseRZapata.github.io/.venv/lib/python3.9/site-packages/sklearn/discriminant_analysis.py:926: UserWarning: Variables are collinear
  warnings.warn("Variables are collinear")
/Users/jzapata/Sites/JoseRZapata.github.io/.venv/lib/python3.9/site-packages/sklearn/discriminant_analysis.py:951: RuntimeWarning: divide by zero encountered in power
  X2 = np.dot(Xm, R * (S ** (-0.5)))
/Users/jzapata/Sites/JoseRZapata.github.io/.venv/lib/python3.9/site-packages/sklearn/discriminant_analysis.py:951: RuntimeWarning: invalid value encountered in multiply
  X2 = np.dot(Xm, R * (S ** (-0.5)))
/Users/jzapata/Sites/JoseRZapata.github.io/.venv/lib/python3.9/site-packages/sklearn/discriminant_analysis.py:954: RuntimeWarning: divide by zero encountered in log
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
/Users/jzapata/Sites/JoseRZapata.github.io/.venv/lib/python3.9/site-packages/sklearn/discriminant_analysis.py:951: RuntimeWarning: divide by zero encountered in power
  X2 = np.dot(Xm, R * (S ** (-0.5)))
/Users/jzapata/Sites/JoseRZapata.github.io/.venv/lib/python3.9/site-packages/sklearn/discriminant_analysis.py:951: RuntimeWarning: invalid value encountered in multiply
  X2 = np.dot(Xm, R * (S ** (-0.5)))
/Users/jzapata/Sites/JoseRZapata.github.io/.venv/lib/python3.9/site-packages/sklearn/discriminant_analysis.py:954: RuntimeWarning: divide by zero encountered in log
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
/Users/jzapata/Sites/JoseRZapata.github.io/.venv/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/jzapata/Sites/JoseRZapata.github.io/.venv/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

SGD

result_dict['survived ~ sgd'] = build_model(SGDClassifier(max_iter=1000,
                                                          tol=1e-3),
                                           'survived',
                                            FEATURES,
                                            titanic_df)

compare_results()
Classification:  survived ~ logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7889688249400479
precision 0.7701863354037267
recall 0.7085714285714285
accuracy_count 658

Test data
accuracy 0.7464114832535885
precision 0.6571428571428571
recall 0.6133333333333333
accuracy_count 156

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.5971223021582733
precision 0.0
recall 0.0
accuracy_count 498

Test data
accuracy 0.5741626794258373
precision 0.0
recall 0.0
accuracy_count 120

Classification:  survived ~ sgd

Training data
accuracy 0.9964028776978417
precision 0.9942028985507246
recall 0.997093023255814
accuracy_count 831

Test data
accuracy 0.9952153110047847
precision 1.0
recall 0.9876543209876543
accuracy_count 208

SVC Lineal

https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

  • SVC con kernel lineal
  • dual=False cuando el numero de muestras > numero de caracteristicas
result_dict['survived ~ linear_svc'] = build_model( LinearSVC(C=1.0,
                                                              max_iter=1000,
                                                              tol=1e-3,
                                                              dual=False),
                                                  'survived',
                                                   FEATURES,
                                                   titanic_df)

compare_results()
Classification:  survived ~ logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7889688249400479
precision 0.7701863354037267
recall 0.7085714285714285
accuracy_count 658

Test data
accuracy 0.7464114832535885
precision 0.6571428571428571
recall 0.6133333333333333
accuracy_count 156

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.5971223021582733
precision 0.0
recall 0.0
accuracy_count 498

Test data
accuracy 0.5741626794258373
precision 0.0
recall 0.0
accuracy_count 120

Classification:  survived ~ sgd

Training data
accuracy 0.9964028776978417
precision 0.9942028985507246
recall 0.997093023255814
accuracy_count 831

Test data
accuracy 0.9952153110047847
precision 1.0
recall 0.9876543209876543
accuracy_count 208

Classification:  survived ~ linear_svc

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Radius Neighbors Classifier

result_dict['survived ~ radius_neighbors'] = build_model(RadiusNeighborsClassifier(radius=40.0),
                                                         'survived',
                                                         FEATURES,
                                                         titanic_df)
compare_results()
Classification:  survived ~ logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7889688249400479
precision 0.7701863354037267
recall 0.7085714285714285
accuracy_count 658

Test data
accuracy 0.7464114832535885
precision 0.6571428571428571
recall 0.6133333333333333
accuracy_count 156

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.5971223021582733
precision 0.0
recall 0.0
accuracy_count 498

Test data
accuracy 0.5741626794258373
precision 0.0
recall 0.0
accuracy_count 120

Classification:  survived ~ sgd

Training data
accuracy 0.9964028776978417
precision 0.9942028985507246
recall 0.997093023255814
accuracy_count 831

Test data
accuracy 0.9952153110047847
precision 1.0
recall 0.9876543209876543
accuracy_count 208

Classification:  survived ~ linear_svc

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Classification:  survived ~ radius_neighbors

Training data
accuracy 0.657074340527578
precision 0.6917808219178082
recall 0.2953216374269006
accuracy_count 548

Test data
accuracy 0.6889952153110048
precision 0.6875
recall 0.39759036144578314
accuracy_count 144

Decision Tree classifier

max_depth = None [ If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples ]

max_features = None [None – max_features=n_features, auto – then max_features=sqrt(n_features), sqrt – then max_features=sqrt(n_features), log2 – then max_features=log2(n_features)]

result_dict['survived ~ decision_tree'] = build_model(DecisionTreeClassifier(),
                                                 'survived',
                                                  FEATURES,
                                                  titanic_df)

compare_results()
Classification:  survived ~ logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7889688249400479
precision 0.7701863354037267
recall 0.7085714285714285
accuracy_count 658

Test data
accuracy 0.7464114832535885
precision 0.6571428571428571
recall 0.6133333333333333
accuracy_count 156

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.5971223021582733
precision 0.0
recall 0.0
accuracy_count 498

Test data
accuracy 0.5741626794258373
precision 0.0
recall 0.0
accuracy_count 120

Classification:  survived ~ sgd

Training data
accuracy 0.9964028776978417
precision 0.9942028985507246
recall 0.997093023255814
accuracy_count 831

Test data
accuracy 0.9952153110047847
precision 1.0
recall 0.9876543209876543
accuracy_count 208

Classification:  survived ~ linear_svc

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Classification:  survived ~ radius_neighbors

Training data
accuracy 0.657074340527578
precision 0.6917808219178082
recall 0.2953216374269006
accuracy_count 548

Test data
accuracy 0.6889952153110048
precision 0.6875
recall 0.39759036144578314
accuracy_count 144

Classification:  survived ~ decision_tree

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Naive Bayes

result_dict['survived ~ naive_bayes'] = build_model(GaussianNB(),
                                                    'survived',
                                                    FEATURES,
                                                    titanic_df)

compare_results()
Classification:  survived ~ logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7889688249400479
precision 0.7701863354037267
recall 0.7085714285714285
accuracy_count 658

Test data
accuracy 0.7464114832535885
precision 0.6571428571428571
recall 0.6133333333333333
accuracy_count 156

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.5971223021582733
precision 0.0
recall 0.0
accuracy_count 498

Test data
accuracy 0.5741626794258373
precision 0.0
recall 0.0
accuracy_count 120

Classification:  survived ~ sgd

Training data
accuracy 0.9964028776978417
precision 0.9942028985507246
recall 0.997093023255814
accuracy_count 831

Test data
accuracy 0.9952153110047847
precision 1.0
recall 0.9876543209876543
accuracy_count 208

Classification:  survived ~ linear_svc

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Classification:  survived ~ radius_neighbors

Training data
accuracy 0.657074340527578
precision 0.6917808219178082
recall 0.2953216374269006
accuracy_count 548

Test data
accuracy 0.6889952153110048
precision 0.6875
recall 0.39759036144578314
accuracy_count 144

Classification:  survived ~ decision_tree

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Classification:  survived ~ naive_bayes

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 834

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 209

Comparacion de modelos

# Crear un diccionario solo con los resultados de prueba de cada modelo
nombre_modelos = result_dict.keys()
resultados_train = {} # crear diccionario vacio
resultados_test = {} # crear diccionario vacio
for nombre in nombre_modelos:
    resultados_train[nombre] = result_dict[nombre]['training']['accuracy']
    resultados_test[nombre] = result_dict[nombre]['test']['accuracy']

df_comparacion = pd.DataFrame([resultados_train, resultados_test], index=['train', 'test'])
# Plot the bar chart
fig, ax = plt.subplots(figsize=(12, 4))
df_comparacion.T.plot(kind='bar', ax=ax)

# Adjust the layout
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy Comparison')

# Set the x-tick labels inside the bars and rotate by 90 degrees
ax.set_xticks(range(len(df_comparacion.columns)))
ax.set_xticklabels([])

# Draw the x-tick labels inside the bars rotated by 90 degrees
for i, label in enumerate(df_comparacion.columns):
    bar_center = (df_comparacion.loc['train', label] + df_comparacion.loc['test', label]) / 2
    ax.text(i, bar_center, label, ha='center', va='center_baseline', rotation=45)

plt.tight_layout()

png

Cross Validation - Seleccion de Modelos

Analizar la varianza de los resultados para obtener los que tengan mejor resultado.

# Grabar los resultados de cada modelo
from sklearn import model_selection

models = []

#logistic Regression
models.append(('Logistic', LogisticRegression(solver='liblinear')))

# Decision Tree classifier
models.append(('Decision Tree', DecisionTreeClassifier()))

#
models.append(('LDA', LinearDiscriminantAnalysis(solver= 'svd')))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    # Kfol cross validation for model selection
    kfold = model_selection.KFold(n_splits=10)
    #X train , y train
    cv_results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = f"({name}, {cv_results.mean()}, {cv_results.std()}"
    print(msg)
(Logistic, 0.772346528973035, 0.05161741106687893
(Decision Tree, 0.7364314400458978, 0.046319696412797186
(LDA, 0.7735083189902466, 0.043772574310023425
plt.figure(figsize = (15,8)) 
result_df = pd.DataFrame(results, index=names).T
result_df.boxplot()
plt.title("Resultados de Cross Validation");

png

Comparacion Estadistica de Modelos

from scipy.stats import f_oneway

model1 = result_df['Logistic']
model2 = result_df['Decision Tree']
model3 = result_df['LDA']

statistic, p_value = f_oneway(model1, model2, model3)

print(f'Statistic: {statistic}')
print(f'p_value: {p_value}')

alpha = 0.05  # nivel de significancia

if p_value < alpha:
    print("Existe una diferencia estadísticamente significativa en los resultados de cross-validation de los modelos.")
else:
    print("No Existe una diferencia estadísticamente significativa en los resultados de cross-validation de los modelos.")
Statistic: 1.7836579802261752
p_value: 0.1872566216785913
No Existe una diferencia estadísticamente significativa en los resultados de cross-validation de los modelos.

Hyperparameter tunning (Optimizacion de hiperparametros)

titanic_df = pd.read_csv('titanic_processed.csv')

titanic_df.head()

pclass survived sex age sibsp parch fare embarked_C embarked_Q embarked_S
0 3 0 0 23.0 0 0 7.9250 0 0 1
1 2 0 1 40.0 1 0 26.0000 0 0 1
2 3 0 0 32.0 1 1 15.5000 0 1 0
3 1 1 0 64.0 0 2 83.1583 1 0 0
4 3 0 1 20.0 0 0 7.8542 0 0 1
X = titanic_df.drop('survived', axis='columns')

Y = titanic_df['survived']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
def summarize_classification(y_test, y_pred):
    
    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)

    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    print("Test data count: ",len(y_test))
    print("accuracy_count : " , num_acc)
    print("accuracy_score : " , acc)
    print("precision_score : " , prec)
    print("recall_score : ", recall)
    print()

Decision Tree

from sklearn.model_selection import GridSearchCV

parameters = {'max_depth': [2, 4, 5, 7, 9, 10]}

grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_
{'max_depth': 5}
# Para ver todos los resultados del cross validation
# No es necesario, solo es informativo para ver como varia el modelo
for i in range(6):
    print('Parameters: ', grid_search.cv_results_['params'][i])

    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])
Parameters:  {'max_depth': 2}
Mean Test Score:  0.790167865707434
Rank:  3
Parameters:  {'max_depth': 4}
Mean Test Score:  0.8009592326139089
Rank:  2
Parameters:  {'max_depth': 5}
Mean Test Score:  0.8093525179856115
Rank:  1
Parameters:  {'max_depth': 7}
Mean Test Score:  0.7865707434052758
Rank:  4
Parameters:  {'max_depth': 9}
Mean Test Score:  0.7613908872901679
Rank:  6
Parameters:  {'max_depth': 10}
Mean Test Score:  0.762589928057554
Rank:  5
decision_tree_model = DecisionTreeClassifier( \
    max_depth = grid_search.best_params_['max_depth']).fit(x_train, y_train)
y_pred = decision_tree_model.predict(x_test)
summarize_classification(y_test, y_pred)
Test data count:  209
accuracy_count :  153
accuracy_score :  0.7320574162679426
precision_score :  0.8666666666666667
recall_score :  0.52

Regresion logistica

parameters = {'penalty': ['l1', 'l2'], 
              'C': [0.1, 0.4, 0.8, 1, 2, 5]}

grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_
{'C': 2, 'penalty': 'l1'}
# Para ver todos los resultados del cross validation
# No es necesario, solo es informativo para ver como varia el modelo
for i in range(12):
    print('Parameters: ', grid_search.cv_results_['params'][i])
    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])
Parameters:  {'C': 0.1, 'penalty': 'l1'}
Mean Test Score:  0.7793764988009593
Rank:  12
Parameters:  {'C': 0.1, 'penalty': 'l2'}
Mean Test Score:  0.7961630695443644
Rank:  9
Parameters:  {'C': 0.4, 'penalty': 'l1'}
Mean Test Score:  0.7937649880095924
Rank:  10
Parameters:  {'C': 0.4, 'penalty': 'l2'}
Mean Test Score:  0.7925659472422062
Rank:  11
Parameters:  {'C': 0.8, 'penalty': 'l1'}
Mean Test Score:  0.7985611510791367
Rank:  5
Parameters:  {'C': 0.8, 'penalty': 'l2'}
Mean Test Score:  0.7961630695443646
Rank:  8
Parameters:  {'C': 1, 'penalty': 'l1'}
Mean Test Score:  0.7985611510791367
Rank:  5
Parameters:  {'C': 1, 'penalty': 'l2'}
Mean Test Score:  0.7973621103117506
Rank:  7
Parameters:  {'C': 2, 'penalty': 'l1'}
Mean Test Score:  0.8033573141486811
Rank:  1
Parameters:  {'C': 2, 'penalty': 'l2'}
Mean Test Score:  0.7997601918465228
Rank:  4
Parameters:  {'C': 5, 'penalty': 'l1'}
Mean Test Score:  0.8033573141486811
Rank:  1
Parameters:  {'C': 5, 'penalty': 'l2'}
Mean Test Score:  0.8009592326139089
Rank:  3
logistic_model = LogisticRegression(solver='liblinear', \
    penalty=grid_search.best_params_['penalty'], C=grid_search.best_params_['C']). \
    fit(x_train, y_train)
y_pred = logistic_model.predict(x_test)
summarize_classification(y_test, y_pred)
Test data count:  209
accuracy_count :  157
accuracy_score :  0.7511961722488039
precision_score :  0.7926829268292683
recall_score :  0.65

Final Evaluation Test

from sklearn.metrics import classification_report
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import ConfusionMatrixDisplay

Decision Tree

decision_tree_model = DecisionTreeClassifier( \
    max_depth = 4).fit(x_train, y_train)
y_pred = decision_tree_model.predict(x_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.67      0.96      0.79       109
           1       0.92      0.49      0.64       100

    accuracy                           0.74       209
   macro avg       0.80      0.73      0.72       209
weighted avg       0.79      0.74      0.72       209
ConfusionMatrixDisplay.from_predictions(y_test,y_pred);

png

PrecisionRecallDisplay.from_predictions(y_test,y_pred);

png

Regresion Logistica

logistic_model = LogisticRegression(solver='liblinear',
                                    penalty='l1', C=5).fit(x_train, y_train)
y_pred = logistic_model.predict(x_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.72      0.84      0.78       109
           1       0.79      0.65      0.71       100

    accuracy                           0.75       209
   macro avg       0.76      0.75      0.75       209
weighted avg       0.76      0.75      0.75       209
ConfusionMatrixDisplay.from_predictions(y_test,y_pred);

png

PrecisionRecallDisplay.from_predictions(y_test,y_pred);

png

from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay

y_score = logistic_model.decision_function(x_test)

fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=logistic_model.classes_[1])
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

png

Grabar el Modelo

from joblib import dump, load # libreria de serializacion

# grabar el modelo en un archivo
dump(logistic_model, 'logistic_model-titanic.joblib')
['logistic_model-titanic.joblib']
from joblib import load

mi_modelo = load('logistic_model-titanic.joblib')
mi_modelo.predict(x_test)
array([1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1])

Referencias

Cheatsheet scikit-learn https://datacamp-community-prod.s3.amazonaws.com/5433fa18-9f43-44cc-b228-74672efcd116

Phd. Jose R. Zapata