In [9]:
import pandas as pd
import numpy as np
In [10]:
data = pd.read_csv('titanic/train.csv')
data.head()
Out[10]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

I tidy up the dataset:

In [11]:
# Encode Sex as a dummy (0 = female, 1 = male)
data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)
data.head()
Out[11]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris 1 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina 0 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry 1 35.0 0 0 373450 8.0500 NaN S
In [12]:
# Impute the missing Age values with the median
data['Age'] = data['Age'].fillna(data['Age'].median())
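An equivalent alternative, useful if this step later needs to live inside a scikit-learn pipeline, is SimpleImputer (a sketch, assuming scikit-learn >= 0.20; not part of the original run):

from sklearn.impute import SimpleImputer

# Sketch: median imputation of Age via scikit-learn instead of fillna
imputer = SimpleImputer(strategy='median')
data[['Age']] = imputer.fit_transform(data[['Age']])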
In [13]:
# Dummy for adult (Age >= 18) or not
data['Adult'] = (data['Age'] >= 18).astype(int)
data.head()
Out[13]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Adult
0 1 0 3 Braund, Mr. Owen Harris 1 22.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 38.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina 0 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 1
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 35.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry 1 35.0 0 0 373450 8.0500 NaN S 1

I define the features and the target:

In [14]:
X = data[['Pclass', 'Sex', 'Adult']]
y = data['Survived']

I split into training and test sets:

In [15]:
from sklearn.model_selection import train_test_split
In [16]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
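By default train_test_split holds out 25% of the rows for validation (223 of the 891 passengers here). If we wanted to make the split size explicit and preserve the class proportions, a hedged variant would be:

# Sketch: explicit test size and stratification on the target
train_X, val_X, train_y, val_y = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=1)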

I build the model:

In [17]:
from sklearn.neighbors import KNeighborsClassifier
In [18]:
# Define
knn = KNeighborsClassifier(n_neighbors=3)
# Fit
knn.fit(train_X, train_y)
# Predict
pred_y = knn.predict(val_X)

We evaluate the model:

In [21]:
from sklearn.metrics import accuracy_score

# Evaluate
accuracy_score(val_y, pred_y)
Out[21]:
0.7130044843049327
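For context, it helps to compare this against the trivial baseline of always predicting the majority class ("did not survive"); a quick sketch on the same validation split:

# Sketch: accuracy of always predicting 0 (did not survive)
baseline = (val_y == 0).mean()
baseline

On the Titanic data this baseline sits around 0.6, so the model is a real but modest improvement.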

We can see how the score depends on the number of neighbors:

In [24]:
for k in range(1, 50):
    # Define
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit
    knn.fit(train_X, train_y)
    # Predict
    pred_y = knn.predict(val_X)
    # Evaluate
    print("k=", k, "score:", accuracy_score(val_y, pred_y))
k= 1 score: 0.7085201793721974
k= 2 score: 0.7085201793721974
k= 3 score: 0.7130044843049327
k= 4 score: 0.7085201793721974
k= 5 score: 0.7130044843049327
k= 6 score: 0.7130044843049327
k= 7 score: 0.7399103139013453
k= 8 score: 0.757847533632287
k= 9 score: 0.757847533632287
k= 10 score: 0.7533632286995515
k= 11 score: 0.757847533632287
k= 12 score: 0.7533632286995515
k= 13 score: 0.7533632286995515
k= 14 score: 0.7533632286995515
k= 15 score: 0.7533632286995515
k= 16 score: 0.7533632286995515
k= 17 score: 0.757847533632287
k= 18 score: 0.757847533632287
k= 19 score: 0.757847533632287
k= 20 score: 0.757847533632287
k= 21 score: 0.757847533632287
k= 22 score: 0.757847533632287
k= 23 score: 0.757847533632287
k= 24 score: 0.757847533632287
k= 25 score: 0.757847533632287
k= 26 score: 0.757847533632287
k= 27 score: 0.757847533632287
k= 28 score: 0.757847533632287
k= 29 score: 0.757847533632287
k= 30 score: 0.757847533632287
k= 31 score: 0.757847533632287
k= 32 score: 0.757847533632287
k= 33 score: 0.757847533632287
k= 34 score: 0.757847533632287
k= 35 score: 0.7847533632286996
k= 36 score: 0.757847533632287
k= 37 score: 0.7847533632286996
k= 38 score: 0.757847533632287
k= 39 score: 0.7847533632286996
k= 40 score: 0.757847533632287
k= 41 score: 0.7847533632286996
k= 42 score: 0.7847533632286996
k= 43 score: 0.7847533632286996
k= 44 score: 0.757847533632287
k= 45 score: 0.757847533632287
k= 46 score: 0.757847533632287
k= 47 score: 0.757847533632287
k= 48 score: 0.757847533632287
k= 49 score: 0.757847533632287
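Rather than reading the best k off the printout, we can collect the scores and pick the maximum programmatically; a minimal sketch:

# Sketch: collect validation scores and pick the best k
scores = []
for k in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_X, train_y)
    scores.append(accuracy_score(val_y, knn.predict(val_X)))
best_k = int(np.argmax(scores)) + 1  # +1 because the range starts at 1
print("best k:", best_k, "score:", max(scores))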

To use the Hamming distance, Pclass would have to be converted into dummies first (a sketch of the Hamming-metric model follows below):

In [25]:
# Work on a copy: X is a slice of data, and assigning in place
# would raise a SettingWithCopyWarning
X = X.copy()
X['Pclass'] = X['Pclass'].astype(str)
X = pd.get_dummies(X)
X.head()
Out[25]:
Sex Adult Pclass_1 Pclass_2 Pclass_3
0 1 1 0 0 1
1 0 1 1 0 0
2 0 1 0 0 1
3 0 1 1 0 0
4 1 1 0 0 1
In [26]:
# Drop one dummy column: it is redundant given the other two
X = X.drop('Pclass_3', axis=1)
X.head()
Out[26]:
Sex Adult Pclass_1 Pclass_2
0 1 1 0 0
1 0 1 1 0
2 0 1 0 0
3 0 1 1 0
4 1 1 0 0
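With every feature now binary, the Hamming distance becomes meaningful, and KNeighborsClassifier accepts it directly via metric='hamming'. A sketch (the choice n_neighbors=7 is an illustrative assumption):

# Sketch: KNN with Hamming distance on the all-binary feature matrix
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
knn_h = KNeighborsClassifier(n_neighbors=7, metric='hamming')
knn_h.fit(train_X, train_y)
accuracy_score(val_y, knn_h.predict(val_X))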

We can also standardize the scale of the variables.

In [27]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
scaler.fit(X)
X=scaler.transform(X)

X
Out[27]:
array([[ 0.73769513,  0.38110919, -0.56568542, -0.51015154],
       [-1.35557354,  0.38110919,  1.76776695, -0.51015154],
       [-1.35557354,  0.38110919, -0.56568542, -0.51015154],
       ...,
       [-1.35557354,  0.38110919, -0.56568542, -0.51015154],
       [ 0.73769513,  0.38110919,  1.76776695, -0.51015154],
       [ 0.73769513,  0.38110919, -0.56568542, -0.51015154]])
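Note that the scaler returns a bare NumPy array, so the column names are lost. If we wanted to keep working with a labelled DataFrame, an alternative (a sketch that would replace the overwrite above, while X is still a DataFrame) is:

# Sketch: keep the scaled features in a DataFrame with the original labels
# (must run before X is overwritten with the bare array)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)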
In [28]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

for k in range(1, 50):
    # Define
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit
    knn.fit(train_X, train_y)
    # Predict
    pred_y = knn.predict(val_X)
    # Evaluate
    print("k=", k, "score:", accuracy_score(val_y, pred_y))
k= 1 score: 0.7533632286995515
k= 2 score: 0.7533632286995515
k= 3 score: 0.7085201793721974
k= 4 score: 0.7533632286995515
k= 5 score: 0.7085201793721974
k= 6 score: 0.7533632286995515
k= 7 score: 0.7533632286995515
k= 8 score: 0.7533632286995515
k= 9 score: 0.7533632286995515
k= 10 score: 0.7533632286995515
k= 11 score: 0.7533632286995515
k= 12 score: 0.7533632286995515
k= 13 score: 0.7533632286995515
k= 14 score: 0.7533632286995515
k= 15 score: 0.7533632286995515
k= 16 score: 0.7533632286995515
k= 17 score: 0.7533632286995515
k= 18 score: 0.7533632286995515
k= 19 score: 0.7533632286995515
k= 20 score: 0.7533632286995515
k= 21 score: 0.7802690582959642
k= 22 score: 0.7533632286995515
k= 23 score: 0.757847533632287
k= 24 score: 0.757847533632287
k= 25 score: 0.7847533632286996
k= 26 score: 0.7847533632286996
k= 27 score: 0.7847533632286996
k= 28 score: 0.7847533632286996
k= 29 score: 0.7802690582959642
k= 30 score: 0.7802690582959642
k= 31 score: 0.7802690582959642
k= 32 score: 0.7533632286995515
k= 33 score: 0.7802690582959642
k= 34 score: 0.7802690582959642
k= 35 score: 0.7802690582959642
k= 36 score: 0.7802690582959642
k= 37 score: 0.7802690582959642
k= 38 score: 0.7802690582959642
k= 39 score: 0.7802690582959642
k= 40 score: 0.7533632286995515
k= 41 score: 0.7533632286995515
k= 42 score: 0.7533632286995515
k= 43 score: 0.7533632286995515
k= 44 score: 0.7533632286995515
k= 45 score: 0.7533632286995515
k= 46 score: 0.7533632286995515
k= 47 score: 0.7533632286995515
k= 48 score: 0.7533632286995515
k= 49 score: 0.7533632286995515
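The jumpiness of these scores reflects the single, fairly small validation split. A more robust way to choose k is cross-validation over all the data; a minimal sketch with GridSearchCV (the 5-fold setting and grid bounds are illustrative assumptions):

from sklearn.model_selection import GridSearchCV

# Sketch: pick k by 5-fold cross-validated accuracy instead of one split
grid = GridSearchCV(KNeighborsClassifier(),
                    param_grid={'n_neighbors': list(range(1, 50))},
                    cv=5, scoring='accuracy')
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)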