import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import hvplot.pandas
from scipy import stats
%matplotlib inline
sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')
11 Classification models
Objective
The goal of this class is for students to learn how to apply exploratory data analysis and classification models in Python, using tools such as pandas, matplotlib, seaborn and scikit-learn, to identify key patterns and relationships in the data.
Classification is a fundamental task in machine learning that consists of assigning a category or label to each observation based on its features. It is widely used in many applications, such as image and speech recognition, sentiment analysis, fraud detection and many others. In this module we will dive into the world of classification algorithms with Python and explore some of the most popular and powerful models.
- Logistic Regression
- Artificial Neural Networks (Coming soon)
- K-nearest Neighbors
- Support Vector Machine
- Decision Trees Classifier
- Random Forest Classifier
- XGBoost Classifier
Heart disease prediction with machine learning
1. Problem definition
Given a patient's clinical parameters, can we predict whether or not they have heart disease?
2. Features
Here you will find information about each of the features in the data. You can gather it by researching on your own (for example, by following the links above) or by talking to a subject-matter expert (someone who knows the dataset).
- age - age in years
- sex - (1 = male; 0 = female)
- cp - chest pain type
  - 0: Typical angina: chest pain related to decreased blood supply to the heart
  - 1: Atypical angina: chest pain not related to the heart
  - 2: Non-anginal pain: typically esophageal spasms (not heart related)
  - 3: Asymptomatic: chest pain not showing signs of disease
- trestbps - resting blood pressure (in mm Hg on admission to the hospital); anything above 130-140 is typically cause for concern
- chol - serum cholesterol in mg/dl
  - serum = LDL + HDL + .2 * triglycerides
  - above 200 is cause for concern
- fbs - fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
  - > 126 mg/dL signals diabetes
- restecg - resting electrocardiographic results
  - 0: Nothing to note
  - 1: ST-T wave abnormality
    - can range from mild symptoms to severe problems
    - signals a non-normal heart beat
  - 2: Possible or definite left ventricular hypertrophy
    - enlargement of the heart's main pumping chamber
- thalach - maximum heart rate achieved
- exang - exercise induced angina (1 = yes; 0 = no)
- oldpeak - ST depression induced by exercise relative to rest; looks at the stress of the heart during exercise (an unhealthy heart will stress more)
- slope - the slope of the peak exercise ST segment
  - 0: Upsloping: better heart rate with exercise (uncommon)
  - 1: Flatsloping: minimal change (typical healthy heart)
  - 2: Downsloping: signs of an unhealthy heart
- ca - number of major vessels (0-3) colored by fluoroscopy
  - a colored vessel means the doctor can see the blood passing through
  - the more blood movement the better (no clots)
- thal - thallium stress test result
  - 1,3: normal
  - 6: fixed defect: used to be a defect but is ok now
  - 7: reversible defect: no proper blood movement when exercising
- target - has disease or not (1 = yes, 0 = no) (the predicted attribute)
data = pd.read_csv('heart.csv')
data.head()
11.1 Exploratory data analysis (EDA)
The goal here is to learn more about the data.
- What question(s) are you trying to solve?
- What kind of data do we have and how do we treat the different types?
- What is missing from the data and how do you handle it?
- Where are the outliers and why should you care about them? (A quick outlier scan is sketched right after this list.)
- How can you add, change or remove features to get more out of your data?
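The cells below check for missing values, but outliers are never inspected explicitly. As a minimal sketch (assuming the continuous columns of heart.csv described above), an IQR-based scan could look like this:
# Hypothetical IQR-based outlier scan of the continuous columns (sketch only)
continuous_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
for col in continuous_cols:
    q1, q3 = data[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    n_out = ((data[col] < lower) | (data[col] > upper)).sum()
    print(f'{col}: {n_out} potential outliers outside [{lower:.1f}, {upper:.1f}]')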
data.info()
data.shape
pd.set_option('display.float_format', '{:.2f}'.format)
data.describe()
data.target.value_counts()
sns.countplot(data, x='target')
plt.show()
data.isna().sum()
- We have 165 people with heart disease and 138 people without it, so our problem is reasonably balanced (see the baseline check right after this list).
- It looks like the perfect dataset! There are no null values :-)
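Before modeling, it is worth noting the trivial baseline: with 165 positive and 138 negative cases, always predicting "disease" is already right about 54% of the time, so any useful classifier should clearly beat that. A one-line check on the frame loaded above:
# Class proportions: the larger value (~0.54) is the accuracy of a majority-class baseline
data.target.value_counts(normalize=True)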
categorical_val = []
continous_val = []
for column in data.columns:
    if len(data[column].unique()) <= 10:
        categorical_val.append(column)
    else:
        continous_val.append(column)

categorical_val
sns.countplot(data, x='sex', hue='target')
plt.show()
sns.countplot(data, x='cp', hue='target')
plt.show()
sns.countplot(data, x='fbs', hue='target')
plt.show()
plt.figure(figsize=(9, 7))
sns.scatterplot(data, x='age', y='thalach', hue='target')
plt.show()
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.heatmap(corr_matrix,
                 vmin=-1.0,
                 vmax=1.0,
                 annot=True,
                 linewidths=0.5,
                 fmt='.2f',
                 cmap='YlGnBu');
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)  # workaround for an old matplotlib release that clipped the first and last heatmap rows
plt.show()
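Besides the full heatmap, ranking the features by their correlation with the target gives a quicker read. A small sketch reusing the corr_matrix computed above:
# Correlation of each feature with the target, sorted (sketch only)
corr_matrix['target'].drop('target').sort_values().plot(kind='barh', figsize=(8, 6))
plt.xlabel('Correlation with target')
plt.show()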
11.2 Data preprocessing
After exploring the dataset, we can see that some categorical variables need to be converted into dummy variables and all values need to be scaled before training the machine learning models. First, we will use the get_dummies method to create dummy columns for the categorical variables.
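One caveat worth flagging: fitting the scaler on the full dataset before splitting lets information from the future test rows leak into the preprocessing. As a minimal alternative sketch (it assumes the X_train/X_test split created later in this section and is not how the cells below proceed), the scaler would be fit on the training rows only:
# Leakage-free scaling sketch (assumes X_train/X_test from the split defined later in this section)
from sklearn.preprocessing import StandardScaler

s_sc = StandardScaler()
col_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
X_train.loc[:, col_to_scale] = s_sc.fit_transform(X_train[col_to_scale])  # fit statistics on train only
X_test.loc[:, col_to_scale] = s_sc.transform(X_test[col_to_scale])        # reuse train statistics on test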
categorical_val.remove('target')
dataset = pd.get_dummies(data, columns=categorical_val)
dataset.head()
print(data.columns)
print(dataset.columns)
from sklearn.preprocessing import StandardScaler
s_sc = StandardScaler()
col_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
dataset[col_to_scale] = s_sc.fit_transform(dataset[col_to_scale])
dataset.head()
11.3 Building the models
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for the train or test split."""
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print('Train Result:\n================================================')
        print(f'Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%')
        print('_______________________________________________')
        print(f'CLASSIFICATION REPORT:\n{clf_report}')
        print('_______________________________________________')
        print('Confusion Matrix:')
        cm = confusion_matrix(y_train, pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
        disp.plot()
        plt.grid(False)
        plt.show()
        print('\n')
    else:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print('Test Result:\n================================================')
        print(f'Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%')
        print('_______________________________________________')
        print(f'CLASSIFICATION REPORT:\n{clf_report}')
        print('_______________________________________________')
        print('Confusion Matrix:')
        cm = confusion_matrix(y_test, pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
        disp.plot()
        plt.grid(False)
        plt.show()

from sklearn.model_selection import train_test_split
X = dataset.drop('target', axis=1)
y = dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
Now that we have our data split into training and test sets, it is time to build machine learning models. We will train them (find the patterns) on the training set and test them (use the patterns) on the test set; a stratified variant of the split is sketched after the list below. We are going to try six different machine learning models:
- Logistic Regression
- K-nearest Neighbors Classifier
- Support Vector Machine
- Decision Tree Classifier
- Random Forest Classifier
- XGBoost Classifier
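Since the dataset has only about 300 rows, the exact split can noticeably move the scores. A minimal stratified variant (same variables as above) keeps the 165/138 class proportions in both sets:
# Stratified split sketch: preserves the target class proportions in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)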
11.3.1 Logistic Regression
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)
print_score(lr_clf, X_train, y_train, X_test, y_test, train=True)
print_score(lr_clf, X_train, y_train, X_test, y_test, train=False)
test_score = accuracy_score(y_test, lr_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, lr_clf.predict(X_train)) * 100
results_df = pd.DataFrame(data=[['Logistic Regression', train_score, test_score]],
columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
results_df
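A single 70/30 split of roughly 300 rows gives a fairly noisy accuracy estimate, so a cross-validated score is a useful complement. A minimal sketch, reusing the lr_clf, X and y defined above:
# 5-fold cross-validated accuracy as a complement to the single train/test split (sketch only)
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(lr_clf, X, y, cv=5, scoring='accuracy')
print(f'Cross-validated accuracy: {cv_scores.mean() * 100:.2f}% +/- {cv_scores.std() * 100:.2f}')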
11.3.2 K-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
print_score(knn_clf, X_train, y_train, X_test, y_test, train=True)
print_score(knn_clf, X_train, y_train, X_test, y_test, train=False)
test_score = accuracy_score(y_test, knn_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, knn_clf.predict(X_train)) * 100
results_df_2 = pd.DataFrame(data=[['K-nearest neighbors', train_score, test_score]],
columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
results_df = pd.concat([results_df,results_df_2])
results_df
11.3.3 Support Vector Machine
from sklearn.svm import SVC
svm_clf = SVC(kernel='rbf', gamma=0.1, C=1.0)
svm_clf.fit(X_train, y_train)
print_score(svm_clf, X_train, y_train, X_test, y_test, train=True)
print_score(svm_clf, X_train, y_train, X_test, y_test, train=False)
test_score = accuracy_score(y_test, svm_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, svm_clf.predict(X_train)) * 100
results_df_2 = pd.DataFrame(data=[['Support Vector Machine', train_score, test_score]],
columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
results_df = pd.concat([results_df,results_df_2])
results_df
11.3.4 Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=True)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=False)
test_score = accuracy_score(y_test, tree_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, tree_clf.predict(X_train)) * 100
results_df_2 = pd.DataFrame(data=[['Decision Tree Classifier', train_score, test_score]],
columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
results_df = pd.concat([results_df,results_df_2])
results_df
11.3.5 Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
rf_clf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf_clf.fit(X_train, y_train)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)
test_score = accuracy_score(y_test, rf_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, rf_clf.predict(X_train)) * 100
results_df_2 = pd.DataFrame(data=[['Random Forest Classifier', train_score, test_score]],
columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
results_df = pd.concat([results_df,results_df_2])
results_df
11.3.6 XGBoost Classifier
from xgboost import XGBClassifier
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)
print_score(xgb_clf, X_train, y_train, X_test, y_test, train=True)
print_score(xgb_clf, X_train, y_train, X_test, y_test, train=False)
test_score = accuracy_score(y_test, xgb_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, xgb_clf.predict(X_train)) * 100
results_df_2 = pd.DataFrame(data=[['XGBoost Classifier', train_score, test_score]],
columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
results_df = pd.concat([results_df,results_df_2])
results_df
11.4 Hyperparameter tuning
11.4.1 Logistic Regression
from sklearn.model_selection import GridSearchCV
params = {'C': np.logspace(-4, 4, 20),
'solver': ['liblinear']}
lr_clf = LogisticRegression()
lr_cv = GridSearchCV(lr_clf, params, scoring='accuracy', n_jobs=-1, verbose=1, cv=5)
lr_cv.fit(X_train, y_train)
best_params = lr_cv.best_params_
print(f'Best parameters: {best_params}')
lr_clf = LogisticRegression(**best_params)
lr_clf.fit(X_train, y_train)
print_score(lr_clf, X_train, y_train, X_test, y_test, train=True)
print_score(lr_clf, X_train, y_train, X_test, y_test, train=False)
test_score = accuracy_score(y_test, lr_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, lr_clf.predict(X_train)) * 100
results_df_2 = pd.DataFrame(
data=[['Tuned Logistic Regression', train_score, test_score]],
columns=['Model', 'Training Accuracy %', 'Testing Accuracy %']
)
results_df = pd.concat([results_df,results_df_2])
results_df
11.4.2 K-nearest neighbors
train_score = []
test_score = []
neighbors = range(1, 30)
for k in neighbors:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    train_score.append(accuracy_score(y_train, model.predict(X_train)))
    test_score.append(accuracy_score(y_test, model.predict(X_test)))

plt.figure(figsize=(15, 4))
plt.plot(neighbors, train_score, label='Train score')
plt.plot(neighbors, test_score, label='Test score')
plt.xticks(np.arange(1, 30, 1))
plt.xlabel('Number of neighbors')
plt.ylabel('Model score')
plt.legend()
plt.show()
print(f'Maximum KNN score on the test data: {max(test_score)*100:.2f}%')
knn_clf = KNeighborsClassifier(n_neighbors=27)
knn_clf.fit(X_train, y_train)
print_score(knn_clf, X_train, y_train, X_test, y_test, train=True)
print_score(knn_clf, X_train, y_train, X_test, y_test, train=False)
test_score = accuracy_score(y_test, knn_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, knn_clf.predict(X_train)) * 100
results_df_2 = pd.DataFrame(
data=[['Tuned K-nearest neighbors', train_score, test_score]],
columns=['Model', 'Training Accuracy %', 'Testing Accuracy %']
)
results_df = pd.concat([results_df,results_df_2])
results_df
11.4.3 Support Vector Machine
svm_clf = SVC(kernel='rbf', gamma=0.1, C=1.0)
params = {'C':(0.1, 0.5, 1, 2, 5, 10, 20),
'gamma':(0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1),
'kernel':('linear', 'poly', 'rbf')}
svm_cv = GridSearchCV(svm_clf, params, n_jobs=-1, cv=5, verbose=1, scoring='accuracy')
svm_cv.fit(X_train, y_train)
best_params = svm_cv.best_params_
print(f'Best params: {best_params}')
svm_clf = SVC(**best_params)
svm_clf.fit(X_train, y_train)
print_score(svm_clf, X_train, y_train, X_test, y_test, train=True)
print_score(svm_clf, X_train, y_train, X_test, y_test, train=False)
test_score = accuracy_score(y_test, svm_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, svm_clf.predict(X_train)) * 100
results_df_2 = pd.DataFrame(
data=[['Tuned Support Vector Machine', train_score, test_score]],
columns=['Model', 'Training Accuracy %', 'Testing Accuracy %']
)
results_df = pd.concat([results_df,results_df_2])
results_df
11.4.4 Decision Tree Classifier
params = {'criterion':('gini', 'entropy'),
'splitter':('best', 'random'),
'max_depth':(list(range(1, 20))),
'min_samples_split':[2, 3, 4],
'min_samples_leaf':list(range(1, 20))
}
tree_clf = DecisionTreeClassifier(random_state=42)
tree_cv = GridSearchCV(tree_clf, params, scoring='accuracy', n_jobs=-1, verbose=1, cv=5)
tree_cv.fit(X_train, y_train)
best_params = tree_cv.best_params_
print(f'Best_params: {best_params}')
tree_clf = DecisionTreeClassifier(**best_params)
tree_clf.fit(X_train, y_train)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=True)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=False)
test_score = accuracy_score(y_test, tree_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, tree_clf.predict(X_train)) * 100
results_df_2 = pd.DataFrame(
data=[['Tuned Decision Tree Classifier', train_score, test_score]],
columns=['Model', 'Training Accuracy %', 'Testing Accuracy %']
)
results_df = pd.concat([results_df,results_df_2])
results_df
11.4.5 Random Forest Classifier
n_estimators = [500, 900, 1100, 1500]
max_features = ['sqrt', 'log2']  # 'auto' was removed from RandomForestClassifier in recent scikit-learn versions
max_depth = [2, 3, 5, 10, 15, None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
params_grid = {
'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf
}
rf_clf = RandomForestClassifier(random_state=42)
rf_cv = GridSearchCV(rf_clf, params_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
rf_cv.fit(X_train, y_train)
best_params = rf_cv.best_params_
print(f'Best parameters: {best_params}')
rf_clf = RandomForestClassifier(**best_params)
rf_clf.fit(X_train, y_train)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)
test_score = accuracy_score(y_test, rf_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, rf_clf.predict(X_train)) * 100
results_df_2 = pd.DataFrame(
data=[['Tuned Random Forest Classifier', train_score, test_score]],
columns=['Model', 'Training Accuracy %', 'Testing Accuracy %']
)
results_df = pd.concat([results_df,results_df_2])
results_df
11.4.6 XGBoost Classifier Hyperparameter Tuning
param_grid = dict(
n_estimators=stats.randint(10, 1000),
max_depth=stats.randint(1, 10),
learning_rate=stats.uniform(0, 1)
)
xgb_clf = XGBClassifier(use_label_encoder=False)
xgb_cv = RandomizedSearchCV(
xgb_clf, param_grid, cv=5, n_iter=150,
scoring='accuracy', n_jobs=-1, verbose=1
)
xgb_cv.fit(X_train, y_train)
best_params = xgb_cv.best_params_
print(f'Best parameters: {best_params}')
xgb_clf = XGBClassifier(**best_params)
xgb_clf.fit(X_train, y_train)
print_score(xgb_clf, X_train, y_train, X_test, y_test, train=True)
print_score(xgb_clf, X_train, y_train, X_test, y_test, train=False)
test_score = accuracy_score(y_test, xgb_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, xgb_clf.predict(X_train)) * 100
results_df_2 = pd.DataFrame(
data=[['Tuned XGBoost Classifier', train_score, test_score]],
columns=['Model', 'Training Accuracy %', 'Testing Accuracy %']
)
results_df = pd.concat([results_df, results_df_2])
results_df
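With all the tuned models collected in results_df, a quick visual comparison can be sketched from the same table:
# Bar chart comparing training vs. testing accuracy for every model in results_df (sketch only)
results_df.set_index('Model').plot(kind='barh', figsize=(10, 8))
plt.xlabel('Accuracy %')
plt.show()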
11.5 Feature importance according to Random Forest and XGBoost
def feature_imp(df, model):
    fi = pd.DataFrame()
    fi['feature'] = df.columns
    fi['importance'] = model.feature_importances_
    return fi.sort_values(by='importance', ascending=False)

feature_imp(X, rf_clf).set_index('feature').plot(kind='barh', figsize=(12, 7), legend=False)
plt.show()

feature_imp(X, xgb_clf).set_index('feature').plot(kind='barh', figsize=(12, 7), legend=False)
plt.show()
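Impurity-based importances from tree ensembles can be biased toward features with many distinct values. As a hedged alternative, scikit-learn's permutation importance evaluated on the test split gives a model-agnostic view (a sketch, assuming the fitted rf_clf and the X_test/y_test split from above):
# Permutation importance sketch: how much test accuracy drops when each column is shuffled
from sklearn.inspection import permutation_importance

perm = permutation_importance(rf_clf, X_test, y_test, n_repeats=10, random_state=42)
perm_df = pd.DataFrame({'feature': X_test.columns, 'importance': perm.importances_mean})
perm_df.set_index('feature').sort_values('importance').plot(kind='barh', figsize=(12, 7), legend=False)
plt.show()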
11.6 Practical exercises
- Create a new notebook.
- Save the file as Ejercicios_practicos_clase_11.ipynb.
- Add an H1 title with your name.
11.6.1 Practical exercise 1
Implement two classification models using the Wine Quality dataset.