In [9]:
import pandas as pd
import numpy as np
In [10]:
data = pd.read_csv('titanic/train.csv')
data.head()
Out[10]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

I tidy up the dataset:

In [11]:
# Encode Sex as a dummy (0 = female, 1 = male)
data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)
data.head()
Out[11]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris 1 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina 0 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry 1 35.0 0 0 373450 8.0500 NaN S
In [12]:
# Impute the missing Age values with the median
data['Age'] = data['Age'].fillna(data['Age'].median())
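An equivalent alternative, useful if this step later needs to live inside a scikit-learn pipeline, is SimpleImputer (a sketch, assuming scikit-learn >= 0.20; not part of the original run):

from sklearn.impute import SimpleImputer

# Sketch: median imputation of Age via scikit-learn instead of fillna
imputer = SimpleImputer(strategy='median')
data[['Age']] = imputer.fit_transform(data[['Age']])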
In [13]:
# Dummy for adult (Age >= 18) or not
data['Adult'] = (data['Age'] >= 18).astype(int)
data.head()
Out[13]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Adult
0 1 0 3 Braund, Mr. Owen Harris 1 22.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 38.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina 0 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 1
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 35.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry 1 35.0 0 0 373450 8.0500 NaN S 1

I define the features and the target:

In [14]:
X = data[['Pclass', 'Sex', 'Adult']]
y = data['Survived']

I split into training and test sets:

In [15]:
from sklearn.model_selection import train_test_split
In [16]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
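By default train_test_split holds out 25% of the rows for validation (223 of the 891 passengers here). If we wanted to make the split size explicit and preserve the class proportions, a hedged variant would be:

# Sketch: explicit test size and stratification on the target
train_X, val_X, train_y, val_y = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=1)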

I build the model:

In [17]:
from sklearn.neighbors import KNeighborsClassifier
In [18]:
# Define
knn = KNeighborsClassifier(n_neighbors=3)
# Fit
knn.fit(train_X, train_y)
# Predict
pred_y = knn.predict(val_X)

We evaluate the model:

In [21]:
from sklearn.metrics import accuracy_score

# Evaluate
accuracy_score(val_y, pred_y)
Out[21]:
0.7130044843049327
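For context, it helps to compare this against the trivial baseline of always predicting the majority class ("did not survive"); a quick sketch on the same validation split:

# Sketch: accuracy of always predicting 0 (did not survive)
baseline = (val_y == 0).mean()
baseline

On the Titanic data this baseline sits around 0.6, so the model is a real but modest improvement.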

We can see how the score depends on the number of neighbors:

In [24]:
for k in range(1, 50):
    # Define
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit
    knn.fit(train_X, train_y)
    # Predict
    pred_y = knn.predict(val_X)
    # Evaluate
    print("k=", k, "score:", accuracy_score(val_y, pred_y))
k= 1 score: 0.7085201793721974
k= 2 score: 0.7085201793721974
k= 3 score: 0.7130044843049327
k= 4 score: 0.7085201793721974
k= 5 score: 0.7130044843049327
k= 6 score: 0.7130044843049327
k= 7 score: 0.7399103139013453
k= 8 score: 0.757847533632287
k= 9 score: 0.757847533632287
k= 10 score: 0.7533632286995515
k= 11 score: 0.757847533632287
k= 12 score: 0.7533632286995515
k= 13 score: 0.7533632286995515
k= 14 score: 0.7533632286995515
k= 15 score: 0.7533632286995515
k= 16 score: 0.7533632286995515
k= 17 score: 0.757847533632287
k= 18 score: 0.757847533632287
k= 19 score: 0.757847533632287
k= 20 score: 0.757847533632287
k= 21 score: 0.757847533632287
k= 22 score: 0.757847533632287
k= 23 score: 0.757847533632287
k= 24 score: 0.757847533632287
k= 25 score: 0.757847533632287
k= 26 score: 0.757847533632287
k= 27 score: 0.757847533632287
k= 28 score: 0.757847533632287
k= 29 score: 0.757847533632287
k= 30 score: 0.757847533632287
k= 31 score: 0.757847533632287
k= 32 score: 0.757847533632287
k= 33 score: 0.757847533632287
k= 34 score: 0.757847533632287
k= 35 score: 0.7847533632286996
k= 36 score: 0.757847533632287
k= 37 score: 0.7847533632286996
k= 38 score: 0.757847533632287
k= 39 score: 0.7847533632286996
k= 40 score: 0.757847533632287
k= 41 score: 0.7847533632286996
k= 42 score: 0.7847533632286996
k= 43 score: 0.7847533632286996
k= 44 score: 0.757847533632287
k= 45 score: 0.757847533632287
k= 46 score: 0.757847533632287
k= 47 score: 0.757847533632287
k= 48 score: 0.757847533632287
k= 49 score: 0.757847533632287
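Rather than reading the best k off the printout, we can collect the scores and pick the maximum programmatically; a minimal sketch:

# Sketch: collect validation scores and pick the best k
scores = []
for k in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_X, train_y)
    scores.append(accuracy_score(val_y, knn.predict(val_X)))
best_k = int(np.argmax(scores)) + 1  # +1 because the range starts at 1
print("best k:", best_k, "score:", max(scores))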

To use the Hamming distance, Pclass would have to be converted into dummies first (a sketch of the Hamming-metric model follows below):

In [25]:
# Work on a copy: X is a slice of data, and assigning in place
# would raise a SettingWithCopyWarning
X = X.copy()
X['Pclass'] = X['Pclass'].astype(str)
X = pd.get_dummies(X)
X.head()
Out[25]:
Sex Adult Pclass_1 Pclass_2 Pclass_3
0 1 1 0 0 1
1 0 1 1 0 0
2 0 1 0 0 1
3 0 1 1 0 0
4 1 1 0 0 1
In [26]:
# Drop one dummy column: it is redundant given the other two
X = X.drop('Pclass_3', axis=1)
X.head()
Out[26]:
Sex Adult Pclass_1 Pclass_2
0 1 1 0 0
1 0 1 1 0
2 0 1 0 0
3 0 1 1 0
4 1 1 0 0
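With every feature now binary, the Hamming distance becomes meaningful, and KNeighborsClassifier accepts it directly via metric='hamming'. A sketch (the choice n_neighbors=7 is an illustrative assumption):

# Sketch: KNN with Hamming distance on the all-binary feature matrix
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
knn_h = KNeighborsClassifier(n_neighbors=7, metric='hamming')
knn_h.fit(train_X, train_y)
accuracy_score(val_y, knn_h.predict(val_X))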

We can also standardize the scale of the variables.

In [27]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
scaler.fit(X)
X=scaler.transform(X)

X
Out[27]:
array([[ 0.73769513,  0.38110919, -0.56568542, -0.51015154],
       [-1.35557354,  0.38110919,  1.76776695, -0.51015154],
       [-1.35557354,  0.38110919, -0.56568542, -0.51015154],
       ...,
       [-1.35557354,  0.38110919, -0.56568542, -0.51015154],
       [ 0.73769513,  0.38110919,  1.76776695, -0.51015154],
       [ 0.73769513,  0.38110919, -0.56568542, -0.51015154]])
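Note that the scaler returns a bare NumPy array, so the column names are lost. If we wanted to keep working with a labelled DataFrame, an alternative (a sketch that would replace the overwrite above, while X is still a DataFrame) is:

# Sketch: keep the scaled features in a DataFrame with the original labels
# (must run before X is overwritten with the bare array)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)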
In [28]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

for k in range(1, 50):
    # Define
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit
    knn.fit(train_X, train_y)
    # Predict
    pred_y = knn.predict(val_X)
    # Evaluate
    print("k=", k, "score:", accuracy_score(val_y, pred_y))
k= 1 score: 0.7533632286995515
k= 2 score: 0.7533632286995515
k= 3 score: 0.7085201793721974
k= 4 score: 0.7533632286995515
k= 5 score: 0.7085201793721974
k= 6 score: 0.7533632286995515
k= 7 score: 0.7533632286995515
k= 8 score: 0.7533632286995515
k= 9 score: 0.7533632286995515
k= 10 score: 0.7533632286995515
k= 11 score: 0.7533632286995515
k= 12 score: 0.7533632286995515
k= 13 score: 0.7533632286995515
k= 14 score: 0.7533632286995515
k= 15 score: 0.7533632286995515
k= 16 score: 0.7533632286995515
k= 17 score: 0.7533632286995515
k= 18 score: 0.7533632286995515
k= 19 score: 0.7533632286995515
k= 20 score: 0.7533632286995515
k= 21 score: 0.7802690582959642
k= 22 score: 0.7533632286995515
k= 23 score: 0.757847533632287
k= 24 score: 0.757847533632287
k= 25 score: 0.7847533632286996
k= 26 score: 0.7847533632286996
k= 27 score: 0.7847533632286996
k= 28 score: 0.7847533632286996
k= 29 score: 0.7802690582959642
k= 30 score: 0.7802690582959642
k= 31 score: 0.7802690582959642
k= 32 score: 0.7533632286995515
k= 33 score: 0.7802690582959642
k= 34 score: 0.7802690582959642
k= 35 score: 0.7802690582959642
k= 36 score: 0.7802690582959642
k= 37 score: 0.7802690582959642
k= 38 score: 0.7802690582959642
k= 39 score: 0.7802690582959642
k= 40 score: 0.7533632286995515
k= 41 score: 0.7533632286995515
k= 42 score: 0.7533632286995515
k= 43 score: 0.7533632286995515
k= 44 score: 0.7533632286995515
k= 45 score: 0.7533632286995515
k= 46 score: 0.7533632286995515
k= 47 score: 0.7533632286995515
k= 48 score: 0.7533632286995515
k= 49 score: 0.7533632286995515
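The jumpiness of these scores reflects the single, fairly small validation split. A more robust way to choose k is cross-validation over all the data; a minimal sketch with GridSearchCV (the 5-fold setting and grid bounds are illustrative assumptions):

from sklearn.model_selection import GridSearchCV

# Sketch: pick k by 5-fold cross-validated accuracy instead of one split
grid = GridSearchCV(KNeighborsClassifier(),
                    param_grid={'n_neighbors': list(range(1, 50))},
                    cv=5, scoring='accuracy')
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)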