Ejemplo 1

Generamos nosotros los datos aleatoriamente

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
In [2]:
# Generator for spherical blobs of points.
# `sklearn.datasets.samples_generator` was deprecated in scikit-learn 0.22 and
# removed in 0.24; `sklearn.datasets` is the public import path.
from sklearn.datasets import make_blobs
In [3]:
# 300 random points with a fixed seed: X holds the coordinates, y the group of each point
X, y = make_blobs(n_samples=300, random_state=1)
In [6]:
X # Points in the plane
Out[6]:
array([[-1.10195984e+01, -3.15882031e+00],
       [-6.38088086e+00, -8.50663809e+00],
       [-1.12174569e+01, -1.50397990e+00],
       [-1.61589091e+00,  4.18017563e+00],
       [-1.46126019e+00,  4.52549851e+00],
       [-1.13009458e+00,  4.54419108e+00],
       [-1.01336898e+01, -4.75061833e+00],
       [-7.58703957e-01,  3.72276201e+00],
       [-2.30647659e+00,  5.30797676e+00],
       [-7.48773915e+00, -8.06676370e+00],
       [-7.32386504e+00, -7.96393491e+00],
       [-5.67856792e+00, -7.60509852e+00],
       [-8.43890696e+00, -3.84394585e+00],
       [-1.09947323e+01, -4.06014253e+00],
       [-8.17259469e+00, -8.17094642e+00],
       [-1.41076074e+00,  4.10984872e+00],
       [-1.13296642e+01, -5.71403711e+00],
       [-1.25041532e+01, -6.06751247e+00],
       [-1.34052081e+00,  4.15711949e+00],
       [-1.01077040e+01, -3.94479960e+00],
       [-8.18717759e+00, -5.25907547e+00],
       [-1.01659113e+01, -4.12752889e+00],
       [-5.27930518e-01,  5.92630669e+00],
       [-1.37889483e+00,  4.33337717e+00],
       [-2.80207810e+00,  4.05714715e+00],
       [-1.16719083e+01, -3.12831872e+00],
       [-1.00330804e+01, -1.84274349e+00],
       [-1.92744799e+00,  4.93684534e+00],
       [-1.02518924e+01, -2.55350460e+00],
       [-1.08278844e+01, -4.83392615e+00],
       [-1.96576392e+00,  5.23446451e+00],
       [-6.41623854e+00, -8.04588481e+00],
       [-5.90344220e+00, -8.18075749e+00],
       [-9.53654840e+00, -5.12933122e+00],
       [-7.20807793e+00, -7.12024433e+00],
       [-8.38076290e+00, -3.45060767e+00],
       [-8.22505229e+00, -6.80312129e+00],
       [-1.85139546e+00,  3.51886090e+00],
       [-8.63062033e+00, -7.13940564e+00],
       [-1.69825542e+00,  2.79071751e+00],
       [-1.97451969e-01,  2.34634916e+00],
       [-6.61472667e+00, -6.38789300e+00],
       [-9.79490066e-01,  4.08668827e+00],
       [-2.27956075e+00,  5.10452190e+00],
       [-9.37662980e+00, -2.99722684e+00],
       [-1.03242120e+01, -5.29602434e+00],
       [-1.98197711e+00,  4.02243551e+00],
       [-5.62200526e+00, -8.69290967e+00],
       [-1.09316272e+01, -4.48636887e+00],
       [-9.71503679e+00, -4.77944598e+00],
       [-6.69765037e+00, -6.92032891e+00],
       [-1.06990569e+01, -4.49057157e+00],
       [-9.34313235e+00, -4.00453699e+00],
       [-9.89148978e+00, -5.47902886e+00],
       [-3.85803976e-01,  6.37359162e+00],
       [-1.14242679e+01, -2.18538860e+00],
       [-1.34392496e+00,  2.38428865e+00],
       [-6.89501293e+00, -9.31723608e+00],
       [-1.14663009e+00,  4.10839703e+00],
       [-1.03426997e+01, -4.18418829e+00],
       [-8.16299488e+00, -3.38896569e+00],
       [-7.17022690e+00, -7.52303243e+00],
       [-7.43968687e+00, -8.90319772e+00],
       [-2.33080604e+00,  4.39382527e+00],
       [-4.60642026e-01,  4.59164629e+00],
       [-6.69242533e+00, -8.30171791e+00],
       [-7.40878289e+00, -9.15339729e+00],
       [-1.84048021e+00,  3.80256924e+00],
       [-7.53103704e+00, -6.76823676e+00],
       [-9.16170778e+00, -2.40998944e+00],
       [-2.40671820e+00,  6.09894447e+00],
       [-1.42946517e+00,  5.16850105e+00],
       [-1.37397258e+00,  5.29163103e+00],
       [-1.07233096e+01, -4.82111722e+00],
       [-2.88961804e+00,  4.95702736e+00],
       [-1.05434867e+01, -4.15218641e+00],
       [-6.61326623e+00, -7.04905378e+00],
       [ 2.42271161e-04,  5.14853403e+00],
       [-1.77000693e+00,  3.78912781e+00],
       [-9.98435983e+00, -4.64804214e+00],
       [-1.06981110e+01, -4.15674300e+00],
       [-8.59017250e+00, -3.82424697e+00],
       [-2.35122066e+00,  4.00973634e+00],
       [-1.19410359e+01, -3.60085418e+00],
       [-1.96967668e+00,  1.97165210e+00],
       [-7.84274101e+00, -7.03738700e+00],
       [-1.16484338e+01, -4.84390413e+00],
       [-1.27907975e+01, -2.01581973e+00],
       [-7.90673749e-01,  5.15690151e+00],
       [-9.75504306e+00, -3.75151837e+00],
       [-7.91633784e+00, -7.15234668e+00],
       [-1.21819546e+00,  4.30633464e+00],
       [-1.17536381e+01, -3.23855895e+00],
       [-6.75460990e+00, -1.02474759e+01],
       [-1.04464505e+01, -4.62579659e+00],
       [-7.69022419e+00, -8.74707117e+00],
       [-9.20268641e+00, -4.32778687e+00],
       [-1.20349137e+01, -5.89593773e+00],
       [-6.40320111e+00, -7.16687592e+00],
       [-7.02993859e+00, -6.69931052e+00],
       [-9.71296439e+00, -3.69088110e+00],
       [-6.42829877e+00, -6.74397472e+00],
       [-1.00051011e+00,  2.77905153e+00],
       [-7.31655639e+00, -7.77051293e+00],
       [-6.93650519e+00, -6.39281292e+00],
       [-6.20735304e-01,  6.59346952e+00],
       [-1.53773863e+00,  5.53597378e+00],
       [-9.50537595e+00, -4.63402669e+00],
       [-6.28485505e+00, -8.78266971e+00],
       [-9.33669222e+00, -2.16119034e+00],
       [-7.54403928e+00, -7.80306095e+00],
       [-7.85988444e+00, -4.73888254e+00],
       [-7.87372938e+00, -7.59578865e+00],
       [-7.94152277e-01,  2.10495117e+00],
       [-1.04399418e+01, -3.62982119e+00],
       [-2.77687025e+00,  4.64090557e+00],
       [-6.59823892e+00, -7.11954124e+00],
       [-1.01927698e+01, -3.14795512e+00],
       [-9.84144865e+00, -4.14356957e+00],
       [-1.79600465e+00,  4.28743568e+00],
       [-1.07035530e+01, -2.76066248e+00],
       [-1.88188805e+00,  4.20573180e+00],
       [-6.22402063e+00, -8.25611532e+00],
       [-7.94653906e+00, -3.36768655e+00],
       [-6.21160000e+00, -8.29293984e+00],
       [-1.84612968e+00,  4.30474400e+00],
       [-1.92577841e+00,  4.43910442e+00],
       [-9.14443128e+00, -4.36637786e+00],
       [-9.47838518e+00, -9.93079448e+00],
       [-1.01181771e+01, -5.18646928e+00],
       [-5.01025808e+00, -8.09981857e+00],
       [-9.99960399e+00, -5.34996897e+00],
       [ 5.31139823e-01,  2.51012895e+00],
       [-5.37107307e+00, -7.95635833e+00],
       [-7.59711620e+00, -7.03509471e+00],
       [-1.15365057e+01, -4.40124373e+00],
       [-7.28729621e+00, -6.68306776e+00],
       [-8.60893311e+00, -4.61469279e+00],
       [-9.72121320e+00, -4.68662015e+00],
       [-9.41306589e+00, -3.62907430e+00],
       [-7.34661845e+00, -6.09687258e+00],
       [-1.16434858e+00,  4.23178671e+00],
       [-1.13898357e+00,  3.26214848e+00],
       [-8.64403847e+00, -8.98185608e+00],
       [-9.98118494e+00, -3.77616083e+00],
       [-6.19391238e+00, -8.66168524e+00],
       [-1.30901393e+00,  3.09420646e+00],
       [-9.37972697e+00, -4.13752487e+00],
       [-8.13784646e+00, -7.65806949e+00],
       [-2.17665436e+00,  3.40946304e+00],
       [-7.44971443e+00, -6.69511987e+00],
       [-1.04959261e+01, -4.26433353e+00],
       [-7.47972908e+00, -7.70128207e+00],
       [-9.37435033e+00, -4.38830523e+00],
       [-3.03267723e+00,  4.72164926e+00],
       [-7.66055006e+00, -8.46234942e+00],
       [-7.24828238e+00, -7.05222790e+00],
       [-1.07796242e+01, -4.39085753e+00],
       [-6.09834293e+00, -7.44017905e+00],
       [-9.56818636e+00, -4.56034695e+00],
       [-9.37590900e+00, -4.55315308e+00],
       [-1.78245013e+00,  3.47072043e+00],
       [-8.66753040e-01,  3.78295914e+00],
       [-6.37151596e+00, -8.91129543e+00],
       [-1.42706535e+00,  5.08904128e+00],
       [-1.11141825e+01, -3.87242145e+00],
       [-5.96588585e+00, -9.05486260e+00],
       [-9.19003455e-01,  3.45278927e+00],
       [-2.10668847e+00,  5.63099757e+00],
       [-1.05552072e+01, -3.01417980e+00],
       [-3.28102793e-01,  4.11918201e+00],
       [-1.01842915e+01, -4.01017303e+00],
       [-1.02356544e+01, -2.79806066e+00],
       [-2.00341358e+00,  4.45008673e+00],
       [-8.48608233e-01,  5.45093196e+00],
       [-1.86845414e+00,  4.99311306e+00],
       [-5.85642586e+00, -7.03752630e+00],
       [-1.02341495e+01, -3.22553505e+00],
       [-6.55394441e+00, -6.44256627e+00],
       [-9.29199482e+00, -9.85256171e+00],
       [-2.34673261e+00,  3.56128423e+00],
       [-1.12345659e+01, -3.07750962e+00],
       [-8.13399258e-01,  3.54697393e+00],
       [-9.86366431e+00, -2.75129369e+00],
       [-3.10367371e+00,  3.90202401e+00],
       [-1.53940095e+00,  5.02369298e+00],
       [-3.51754177e+00,  5.64265390e+00],
       [-7.48382008e+00, -8.63241302e+00],
       [-9.23890684e+00, -3.06843973e+00],
       [-7.93489041e+00, -7.78403764e+00],
       [-9.90228742e+00, -3.03189848e+00],
       [-1.17104176e+00,  4.33091816e+00],
       [-1.58173878e+00,  5.02487013e+00],
       [-2.33022219e+00,  4.78405366e+00],
       [-7.57969185e-01,  4.90898421e+00],
       [-9.97584967e+00, -4.42202236e+00],
       [-1.46864442e+00,  6.50674501e+00],
       [-1.11168279e+01, -1.99726964e+00],
       [-2.41395785e+00,  5.65935802e+00],
       [ 2.45098802e-01,  5.51754657e+00],
       [-5.75867612e+00, -8.75783107e+00],
       [-8.46369500e+00, -8.07146029e+00],
       [-1.00822205e+01, -4.25071043e+00],
       [-6.25603782e+00, -6.36347342e+00],
       [-6.69321189e+00, -6.30021862e+00],
       [-1.05724063e+00,  4.82677207e+00],
       [-6.39048608e+00, -8.87562001e+00],
       [-8.70233178e+00, -4.19462540e+00],
       [-7.29364801e+00, -6.53986673e+00],
       [-9.58041050e+00, -3.16857790e+00],
       [-2.18773166e+00,  3.33352125e+00],
       [-9.48263889e+00, -6.73588302e+00],
       [-7.86442967e+00, -8.44482270e+00],
       [-5.38142198e-01,  4.81539041e+00],
       [-9.01698747e+00, -9.55555725e+00],
       [-1.23606555e+00,  4.48382994e+00],
       [-7.34072825e+00, -6.92427252e+00],
       [-1.14385885e+01, -2.72109548e+00],
       [-4.99221336e-01,  4.77598259e+00],
       [-2.03484486e+00,  3.76775946e+00],
       [-1.05600179e+01, -1.99847047e+00],
       [-6.61375924e+00, -8.84814901e+00],
       [-9.37917337e+00, -4.39652048e+00],
       [-1.47299851e+00,  4.81654152e+00],
       [-5.44396990e+00, -8.95941292e+00],
       [-8.20576492e-01,  5.33759195e+00],
       [-1.04093517e+01, -2.67482046e+00],
       [-1.68417686e+00,  3.63132825e+00],
       [-1.83198811e+00,  3.52863145e+00],
       [-8.17831829e+00, -8.22063813e+00],
       [-2.06043810e+00,  5.23049549e+00],
       [-9.63138049e+00, -4.99793793e+00],
       [-7.26388037e+00, -6.28675673e+00],
       [-7.83219201e+00, -7.47865740e+00],
       [-2.75447175e+00,  4.57587230e+00],
       [-7.13421088e+00, -8.26162017e+00],
       [-1.08749940e+01, -4.82113577e+00],
       [-8.85279507e+00, -7.79138079e+00],
       [-7.62867092e+00, -8.06354170e+00],
       [ 5.26015501e-01,  3.00999353e+00],
       [-6.54946838e+00, -9.26809916e+00],
       [ 8.68765801e-01,  4.15785509e+00],
       [-5.86705134e+00, -6.44863393e+00],
       [-1.04730854e+01, -3.47573837e+00],
       [-1.09679881e+00,  4.64722696e+00],
       [-6.88384344e+00, -7.04605265e+00],
       [-2.76017908e+00,  5.55121358e+00],
       [-1.15637509e+00,  5.69971575e+00],
       [-1.25606826e+00,  5.00006839e+00],
       [-1.01136977e+01, -4.12880752e+00],
       [-1.60875215e+00,  3.76949422e+00],
       [-7.35387953e+00, -8.54504434e+00],
       [-1.13042466e+01, -3.87696807e+00],
       [-7.88734937e+00, -7.43151681e+00],
       [-2.85882794e+00,  5.26983519e+00],
       [-6.92324165e+00, -1.06695320e+01],
       [-1.09531378e+01, -3.36743812e+00],
       [-6.92263081e+00, -7.63972262e+00],
       [-8.88332953e+00, -4.53987249e+00],
       [-1.64215050e+00,  3.28447114e+00],
       [-8.98758533e+00, -3.03333061e+00],
       [-7.52482501e+00, -7.50887444e+00],
       [-1.49952284e+00,  5.28265879e+00],
       [-6.96685539e+00, -3.12876392e+00],
       [-6.28746298e+00, -8.27199928e+00],
       [ 8.52518583e-02,  3.64528297e+00],
       [-5.25790464e-01,  3.30659860e+00],
       [-9.93696231e+00, -3.74222379e+00],
       [-1.08590289e+01, -3.27863702e+00],
       [-6.61359817e+00, -9.83728809e+00],
       [-8.78427666e+00, -8.09610711e+00],
       [-1.35938959e+00,  4.05424002e+00],
       [-5.91798181e+00, -8.20293068e+00],
       [-7.48937497e+00, -8.88475909e+00],
       [-2.93211866e+00,  4.72003759e+00],
       [-9.20734891e+00, -8.98339697e+00],
       [-7.39616535e+00, -7.76668896e+00],
       [-7.04771746e+00, -8.58237038e+00],
       [-6.73224718e-01,  4.62002377e+00],
       [-5.31844709e+00, -8.92829839e+00],
       [-1.61734616e+00,  4.98930508e+00],
       [-5.60398980e+00, -7.56075530e+00],
       [-8.87430034e+00, -3.64808151e+00],
       [-6.02479303e+00, -9.07166814e+00],
       [-5.30463296e+00, -8.21388060e+00],
       [-7.93192918e+00, -5.42450547e+00],
       [-5.75517628e+00, -9.30821074e+00],
       [-9.55954616e+00, -2.83102023e+00],
       [-6.53600244e+00, -1.03903146e+01],
       [-6.05367512e+00, -9.62979077e+00],
       [-9.14500844e+00, -3.91798845e+00],
       [-1.18708735e+01, -3.03273343e+00],
       [-8.57698874e-01,  4.45305717e+00],
       [-1.02768102e+01, -2.33049946e+00],
       [-8.54628324e+00, -4.57138540e+00],
       [-3.19091528e-02,  4.74450157e+00],
       [-1.11800306e+01, -4.61910307e+00],
       [-6.01988777e+00, -7.54471341e+00],
       [-7.66603898e+00, -7.59715459e+00],
       [-7.24251438e+00, -9.66368448e+00]])
In [7]:
y # Group each point belongs to
Out[7]:
array([1, 2, 1, 0, 0, 0, 1, 0, 0, 2, 2, 2, 1, 1, 2, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 2, 0, 0,
       1, 1, 0, 2, 1, 1, 2, 1, 1, 1, 0, 1, 0, 2, 0, 1, 1, 2, 2, 0, 0, 2,
       2, 0, 2, 1, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 1, 1, 0, 1, 0, 2, 1, 1,
       0, 1, 2, 0, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 0, 2, 2, 0, 0, 1, 2, 1,
       2, 1, 2, 0, 1, 0, 2, 1, 1, 0, 1, 0, 2, 1, 2, 0, 0, 1, 2, 1, 2, 1,
       0, 2, 2, 1, 2, 1, 1, 1, 2, 0, 0, 2, 1, 2, 0, 1, 2, 0, 2, 1, 2, 1,
       0, 2, 2, 1, 2, 1, 1, 0, 0, 2, 0, 1, 2, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       2, 1, 2, 2, 0, 1, 0, 1, 0, 0, 0, 2, 1, 2, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 2, 2, 1, 2, 2, 0, 2, 2, 2, 1, 0, 1, 2, 0, 2, 0, 2, 1, 0, 0,
       1, 2, 1, 0, 2, 0, 1, 0, 0, 2, 0, 1, 2, 2, 0, 2, 1, 2, 2, 0, 2, 0,
       2, 1, 0, 2, 0, 0, 0, 1, 0, 2, 1, 2, 0, 2, 1, 2, 1, 0, 1, 2, 0, 1,
       2, 0, 0, 1, 1, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 1, 2, 2, 1,
       2, 1, 2, 2, 1, 1, 0, 1, 1, 0, 1, 2, 2, 2])
In [8]:
# Raw points, no grouping.
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
sns.scatterplot(x=X[:, 0], y=X[:, 1])
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f29e9e4c7f0>
In [9]:
# The true groupings.
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f29e9b2b6d8>

Voy a predecir agrupamientos con KMedias:

In [10]:
from sklearn.cluster import KMeans
In [11]:
# Define the model.
# n_init=10 makes the algorithm start from several initialisations and keep the
# run with the lowest inertia; random_state pins those initialisations so the
# cluster ids (and every output derived from them) are reproducible on re-run.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)

# Fit
kmeans.fit(X)

# Predict the cluster id of every point
y_pred = kmeans.predict(X)

Vamos a ver los agrupamientos que hemos predicho:

In [12]:
# Points coloured by predicted cluster.
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y_pred)
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f29e928e630>

Podemos sacar los centroides:

In [13]:
# Centroids found by the fitted model, one row per cluster
centroides = kmeans.cluster_centers_
centroides
Out[13]:
array([[ -1.4531567 ,   4.40756967],
       [-10.07499139,  -3.8699274 ],
       [ -7.05318146,  -8.00168371]])

Me los puedo pintar

In [14]:
# Predicted clusters with the centroids drawn on top in red.
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y_pred)
sns.scatterplot(x=centroides[:, 0], y=centroides[:, 1], color='red')
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f29e924d550>

Puedo evaluar el modelo con una matriz de confusión

In [15]:
from sklearn.metrics import confusion_matrix
In [16]:
# Rows are the true groups, columns the predicted cluster ids.
# NOTE(review): cluster ids are presumably arbitrary, so the diagonal is only
# meaningful when the ids happen to coincide with the true labels — confirm
# before reading it as accuracy.
confusion_matrix(y,y_pred)
Out[16]:
array([[100,   0,   0],
       [  0,  99,   1],
       [  0,   1,  99]])

Viendo el dibujo original no estaba tan claro que fueran 3 clusters y no 2. Vamos a hallar el K óptimo usando el método del codo.

Cómo hallar la inercia:
In [17]:
# Inertia of the 3-cluster model fitted above
kmeans.inertia_
Out[17]:
573.9391587506091
In [19]:
# Inertia of a KMeans fit for each K in 1..10 (input for the elbow method).
# A throwaway model is fitted per K so the global `kmeans` (the fitted
# 3-cluster model) is not overwritten by a 10-cluster one; random_state makes
# the curve reproducible on re-run.
inercias = [
    KMeans(n_clusters=k, n_init=10, random_state=0).fit(X).inertia_
    for k in range(1, 11)
]
In [20]:
# One inertia value per K, in the order K = 1..10
inercias
Out[20]:
[12387.458611910348,
 1884.0764333574893,
 573.9391587506091,
 492.1017017571465,
 420.7181275737736,
 362.43609663838225,
 315.2692980170799,
 268.1845778510234,
 236.08229234483653,
 218.84860124777992]
In [24]:
# Elbow plot: inertia against the number of clusters K
plt.plot(range(1, 11), inercias)
plt.xlabel('K')
plt.ylabel('Inercia')
plt.title('El método del codo')
# Arrow pointing at the elbow (K=3); kept as the last expression so its Text is the cell output
plt.annotate('Codo', xy=(3, 600), xytext=(4, 3000), arrowprops=dict(facecolor='red'))
Out[24]:
Text(4, 3000, 'Codo')

Ejemplo 2

En este ejemplo vamos a trabajar con el dataset Iris

In [25]:
import pandas as pd
In [27]:
# Load the Iris dataset from a CSV in the working directory and preview the first rows
iris=pd.read_csv('Iris.csv')
iris.head()
Out[27]:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa

Una forma rápida de visualizar los grupos:

In [28]:
# Columns at positions 1-4: the four measurements (skips Id and Species)
iris.columns[1:5]
Out[28]:
Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')
In [29]:
# Pairwise scatter plots of the four measurement columns, no grouping
sns.pairplot(data=iris, vars=iris.columns[1:5])
Out[29]:
<seaborn.axisgrid.PairGrid at 0x7f29e9142278>

Visualizamos los grupos de verdad:

In [30]:
# Same pair plot, coloured by the true species
sns.pairplot(data=iris, vars=iris.columns[1:5], hue='Species')
Out[30]:
<seaborn.axisgrid.PairGrid at 0x7f29e89c4710>

Antes que nada, definamos X e y

In [31]:
# Feature matrix: the four measurement columns (positions 1-4, skipping Id)
X = iris.iloc[:, 1:5]
# Target: the true species of each row
y = iris['Species']
In [33]:
# Preview the feature matrix
X.head()
Out[33]:
SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
In [34]:
# True species label of every row
y
Out[34]:
0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
5         Iris-setosa
6         Iris-setosa
7         Iris-setosa
8         Iris-setosa
9         Iris-setosa
10        Iris-setosa
11        Iris-setosa
12        Iris-setosa
13        Iris-setosa
14        Iris-setosa
15        Iris-setosa
16        Iris-setosa
17        Iris-setosa
18        Iris-setosa
19        Iris-setosa
20        Iris-setosa
21        Iris-setosa
22        Iris-setosa
23        Iris-setosa
24        Iris-setosa
25        Iris-setosa
26        Iris-setosa
27        Iris-setosa
28        Iris-setosa
29        Iris-setosa
            ...      
120    Iris-virginica
121    Iris-virginica
122    Iris-virginica
123    Iris-virginica
124    Iris-virginica
125    Iris-virginica
126    Iris-virginica
127    Iris-virginica
128    Iris-virginica
129    Iris-virginica
130    Iris-virginica
131    Iris-virginica
132    Iris-virginica
133    Iris-virginica
134    Iris-virginica
135    Iris-virginica
136    Iris-virginica
137    Iris-virginica
138    Iris-virginica
139    Iris-virginica
140    Iris-virginica
141    Iris-virginica
142    Iris-virginica
143    Iris-virginica
144    Iris-virginica
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: Species, Length: 150, dtype: object

Sabemos que son 3 especies, pero a priori podríamos no saberlo, de hecho a simple vista parecen 2. Usemos el método del codo.

In [37]:
# Elbow method on the iris features: inertia of a KMeans fit for K = 1..10.
# A throwaway model is fitted per K so no global `kmeans` is left bound to the
# 10-cluster model; random_state makes the curve reproducible on re-run.
inercias = [
    KMeans(n_clusters=k, n_init=10, random_state=0).fit(X).inertia_
    for k in range(1, 11)
]

# Elbow plot: inertia against K, with an arrow marking the elbow at K=3
plt.plot(range(1, 11), inercias)
plt.title('El método del codo')
plt.xlabel('K')
plt.ylabel('Inercia')
plt.annotate('Codo', xy=(3, 80), xytext=(4, 150), arrowprops={'facecolor': 'red'})
Out[37]:
Text(4, 150, 'Codo')

Son 3 clusters (K=3)

Hago el modelo:

In [38]:
# Define: random_state pins the initialisations so the cluster ids are reproducible on re-run
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans.fit(X)  # Fit
y_pred = kmeans.predict(X)  # Predict the cluster id of every row
In [39]:
# Attach the predicted cluster ids as a new last column and preview the result
iris = iris.assign(Species_pred=y_pred)
iris.head()
Out[39]:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species Species_pred
0 1 5.1 3.5 1.4 0.2 Iris-setosa 1
1 2 4.9 3.0 1.4 0.2 Iris-setosa 1
2 3 4.7 3.2 1.3 0.2 Iris-setosa 1
3 4 4.6 3.1 1.5 0.2 Iris-setosa 1
4 5 5.0 3.6 1.4 0.2 Iris-setosa 1
In [40]:
# Pair plot of the four measurement columns, coloured by predicted cluster
sns.pairplot(data=iris, vars=iris.columns[1:5], hue='Species_pred')
Out[40]:
<seaborn.axisgrid.PairGrid at 0x7f29e5ec2e10>

Para comparar algunas variables más concretamente podemos hacer scatter plots

Los de verdad:

In [41]:
# True species in the petal-length / petal-width plane.
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
sns.scatterplot(data=iris, x='PetalLengthCm', y='PetalWidthCm', hue='Species')
Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f29e5c00860>
In [42]:
# Predicted clusters in the petal-length / petal-width plane.
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
sns.scatterplot(data=iris, x='PetalLengthCm', y='PetalWidthCm', hue='Species_pred')

# Overlay the centroids. Columns 2 and 3 of cluster_centers_ are petal length
# and petal width, because the model was fitted on iris.iloc[:, 1:5].
centroides = kmeans.cluster_centers_
sns.scatterplot(x=centroides[:, 2], y=centroides[:, 3], color='red')
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f29e541fcc0>
In [ ]: