import pandas as pd
import numpy as np
# Load the Titanic training set
data = pd.read_csv('titanic/train.csv')
data.head()
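Before fixing anything, it is worth checking where the missing values are; a quick inspection (not part of the cleaning itself, just a sanity check on the same data frame):
# Count the missing values per column; Age is the one we impute below
data.isnull().sum()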
I tidy up the dataset:
# Encode Sex as a 0/1 dummy variable
data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)
data.head()
# Impute the missing values of Age with the median
data['Age'] = data['Age'].fillna(data['Age'].median())
# Dummy for adult vs. minor
data['Adult'] = (data['Age'] >= 18).astype(int)
data.head()
I define the features and the target:
X = data[['Pclass', 'Sex', 'Adult']]
y = data['Survived']
I split into training and validation sets:
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X,y,random_state=1)
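By default train_test_split holds out 25% of the rows for validation. For classification it can also help to stratify on the target so that both splits keep the same class balance; a variant of the same call (these keyword arguments are standard scikit-learn options):
# Hold out 25% of the rows, preserving the survived/not-survived ratio
train_X, val_X, train_y, val_y = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=1)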
I build the model:
from sklearn.neighbors import KNeighborsClassifier
# Define
knn = KNeighborsClassifier(n_neighbors=3)
# Fit
knn.fit(train_X, train_y)
# Predict
pred_y = knn.predict(val_X)
We evaluate the model:
from sklearn.metrics import accuracy_score
# Evaluate
accuracy_score(val_y, pred_y)
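An accuracy number means little on its own; a majority-class baseline gives it context. A minimal sketch using scikit-learn's DummyClassifier (an addition, not in the original notebook):
from sklearn.dummy import DummyClassifier
# Always predicts the most frequent class; KNN should beat this
baseline = DummyClassifier(strategy='most_frequent')
baseline.fit(train_X, train_y)
accuracy_score(val_y, baseline.predict(val_X))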
We can look at how the score depends on the number of neighbors:
for k in range(1, 50):
    # Define
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit
    knn.fit(train_X, train_y)
    # Predict
    pred_y = knn.predict(val_X)
    # Evaluate
    print("k=", k, "score:", accuracy_score(val_y, pred_y))
To make the distance on Pclass behave like a Hamming distance, we would have to convert it into dummies:
X = X.copy()  # work on a copy to avoid a SettingWithCopyWarning (X is a slice of data)
X['Pclass'] = X['Pclass'].astype(str)
X = pd.get_dummies(X)
X.head()
# Drop one of the three dummies; it is determined by the other two
X = X.drop('Pclass_3', axis=1)
X.head()
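With Pclass one-hot encoded, every feature is a 0/1 indicator, so the Euclidean distance between two rows is monotonic in the number of mismatched positions, i.e. it already behaves like a Hamming count. If we want the Hamming distance literally, KNeighborsClassifier accepts it as a metric; a sketch on the dummy-encoded features (this re-splits X, just as the notebook does again after scaling):
# Hamming distance = fraction of feature positions that differ
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
knn_h = KNeighborsClassifier(n_neighbors=3, metric='hamming')
knn_h.fit(train_X, train_y)
accuracy_score(val_y, knn_h.predict(val_X))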
We can also standardize the scale of the variables. Note that fitting the scaler on all of X before splitting leaks a little information from the validation rows; a cleaner alternative is sketched at the end.
from sklearn.preprocessing import StandardScaler
# Standardize each column to mean 0 and standard deviation 1
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)  # the result is a NumPy array, not a DataFrame
X
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
for k in range(1, 50):
    # Define
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit
    knn.fit(train_X, train_y)
    # Predict
    pred_y = knn.predict(val_X)
    # Evaluate
    print("k=", k, "score:", accuracy_score(val_y, pred_y))