{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"iris=pd.read_csv('Iris.csv')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Id | \n",
" SepalLengthCm | \n",
" SepalWidthCm | \n",
" PetalLengthCm | \n",
" PetalWidthCm | \n",
" Species | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 5.1 | \n",
" 3.5 | \n",
" 1.4 | \n",
" 0.2 | \n",
" Iris-setosa | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 4.9 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
" Iris-setosa | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 4.7 | \n",
" 3.2 | \n",
" 1.3 | \n",
" 0.2 | \n",
" Iris-setosa | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 4.6 | \n",
" 3.1 | \n",
" 1.5 | \n",
" 0.2 | \n",
" Iris-setosa | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 5.0 | \n",
" 3.6 | \n",
" 1.4 | \n",
" 0.2 | \n",
" Iris-setosa | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
"0 1 5.1 3.5 1.4 0.2 Iris-setosa\n",
"1 2 4.9 3.0 1.4 0.2 Iris-setosa\n",
"2 3 4.7 3.2 1.3 0.2 Iris-setosa\n",
"3 4 4.6 3.1 1.5 0.2 Iris-setosa\n",
"4 5 5.0 3.6 1.4 0.2 Iris-setosa"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris['Species'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"iris_setosa=iris[iris['Species']=='Iris-setosa']\n",
"iris_versicolor=iris[iris['Species']=='Iris-versicolor']\n",
"iris_virginica=iris[iris['Species']=='Iris-virginica']"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Id | \n",
" SepalLengthCm | \n",
" SepalWidthCm | \n",
" PetalLengthCm | \n",
" PetalWidthCm | \n",
" Species | \n",
"
\n",
" \n",
" \n",
" \n",
" 50 | \n",
" 51 | \n",
" 7.0 | \n",
" 3.2 | \n",
" 4.7 | \n",
" 1.4 | \n",
" Iris-versicolor | \n",
"
\n",
" \n",
" 51 | \n",
" 52 | \n",
" 6.4 | \n",
" 3.2 | \n",
" 4.5 | \n",
" 1.5 | \n",
" Iris-versicolor | \n",
"
\n",
" \n",
" 52 | \n",
" 53 | \n",
" 6.9 | \n",
" 3.1 | \n",
" 4.9 | \n",
" 1.5 | \n",
" Iris-versicolor | \n",
"
\n",
" \n",
" 53 | \n",
" 54 | \n",
" 5.5 | \n",
" 2.3 | \n",
" 4.0 | \n",
" 1.3 | \n",
" Iris-versicolor | \n",
"
\n",
" \n",
" 54 | \n",
" 55 | \n",
" 6.5 | \n",
" 2.8 | \n",
" 4.6 | \n",
" 1.5 | \n",
" Iris-versicolor | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm \\\n",
"50 51 7.0 3.2 4.7 1.4 \n",
"51 52 6.4 3.2 4.5 1.5 \n",
"52 53 6.9 3.1 4.9 1.5 \n",
"53 54 5.5 2.3 4.0 1.3 \n",
"54 55 6.5 2.8 4.6 1.5 \n",
"\n",
" Species \n",
"50 Iris-versicolor \n",
"51 Iris-versicolor \n",
"52 Iris-versicolor \n",
"53 Iris-versicolor \n",
"54 Iris-versicolor "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris_versicolor.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Me calculo los $ \\#C_k /N$:"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.3333333333333333, 0.3333333333333333, 0.3333333333333333)"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"N_setosa=len(iris_setosa)/len(iris)\n",
"N_versicolor=len(iris_versicolor)/len(iris)\n",
"N_virginica=len(iris_virginica)/len(iris)\n",
"\n",
"N_setosa, N_versicolor, N_virginica"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Id | \n",
" SepalLengthCm | \n",
" SepalWidthCm | \n",
" PetalLengthCm | \n",
" PetalWidthCm | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 50.00000 | \n",
" 50.00000 | \n",
" 50.000000 | \n",
" 50.000000 | \n",
" 50.00000 | \n",
"
\n",
" \n",
" mean | \n",
" 25.50000 | \n",
" 5.00600 | \n",
" 3.418000 | \n",
" 1.464000 | \n",
" 0.24400 | \n",
"
\n",
" \n",
" std | \n",
" 14.57738 | \n",
" 0.35249 | \n",
" 0.381024 | \n",
" 0.173511 | \n",
" 0.10721 | \n",
"
\n",
" \n",
" min | \n",
" 1.00000 | \n",
" 4.30000 | \n",
" 2.300000 | \n",
" 1.000000 | \n",
" 0.10000 | \n",
"
\n",
" \n",
" 25% | \n",
" 13.25000 | \n",
" 4.80000 | \n",
" 3.125000 | \n",
" 1.400000 | \n",
" 0.20000 | \n",
"
\n",
" \n",
" 50% | \n",
" 25.50000 | \n",
" 5.00000 | \n",
" 3.400000 | \n",
" 1.500000 | \n",
" 0.20000 | \n",
"
\n",
" \n",
" 75% | \n",
" 37.75000 | \n",
" 5.20000 | \n",
" 3.675000 | \n",
" 1.575000 | \n",
" 0.30000 | \n",
"
\n",
" \n",
" max | \n",
" 50.00000 | \n",
" 5.80000 | \n",
" 4.400000 | \n",
" 1.900000 | \n",
" 0.60000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm\n",
"count 50.00000 50.00000 50.000000 50.000000 50.00000\n",
"mean 25.50000 5.00600 3.418000 1.464000 0.24400\n",
"std 14.57738 0.35249 0.381024 0.173511 0.10721\n",
"min 1.00000 4.30000 2.300000 1.000000 0.10000\n",
"25% 13.25000 4.80000 3.125000 1.400000 0.20000\n",
"50% 25.50000 5.00000 3.400000 1.500000 0.20000\n",
"75% 37.75000 5.20000 3.675000 1.575000 0.30000\n",
"max 50.00000 5.80000 4.400000 1.900000 0.60000"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris_setosa.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Me defino la función `gauss(x,mu,sigma)` que da\n",
"$$\n",
"\\mathcal{N}(x|\\mu,\\sigma)=\\frac{1}{\\sqrt{2\\pi\\sigma}} e^{-\\frac{(x-\\mu)^2}{2\\sigma^2}}\n",
"$$"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"def gauss(x,mu,sigma):\n",
" return np.exp(-(x-mu)**2/(2*sigma**2))/(np.sqrt(2*np.pi*sigma))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Mi nueva instancia $x$ será simplemente la primera fila del dataset"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"x = iris.iloc[0]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Id 1\n",
"SepalLengthCm 5.1\n",
"SepalWidthCm 3.5\n",
"PetalLengthCm 1.4\n",
"PetalWidthCm 0.2\n",
"Species Iris-setosa\n",
"Name: 0, dtype: object"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.12424897959183666"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris_setosa['SepalLengthCm'].var()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Me calculo lo que quiero maximizar:\n",
"$$\n",
" \\# C_k / N_k \\cdot \\prod_i \\mathcal{N}(x_i|\\mu_k^i,\\sigma_k^i)\n",
" $$"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris.columns[1:5]"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.00014852476867097342"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"P_setosa=N_setosa\n",
"for i in iris.columns[1:5]:\n",
" P_setosa=P_setosa*gauss(x[i],iris_setosa[i].mean(),iris_setosa[i].var())\n",
" \n",
"P_setosa"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2.2129917787430672e-177"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"P_virginica=N_virginica\n",
"for i in iris.columns[1:5]:\n",
" P_virginica=P_virginica*gauss(x[i],iris_virginica[i].mean(),iris_virginica[i].var())\n",
" \n",
"P_virginica"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$$ 2.21 \\cdot 10 ^{-177}$$"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.6634597254045788e-231"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"P_versicolor=N_versicolor\n",
"for i in iris.columns[1:5]:\n",
" P_versicolor=P_versicolor*gauss(x[i],iris_versicolor[i].mean(),iris_versicolor[i].var())\n",
" \n",
"P_versicolor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Predigo que es de tipo Iris-setosa"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Ejercicio\n",
"Automatizar todo esto."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Hagámoslo ahora automáticamente con sklearn"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SepalLengthCm | \n",
" SepalWidthCm | \n",
" PetalLengthCm | \n",
" PetalWidthCm | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5.1 | \n",
" 3.5 | \n",
" 1.4 | \n",
" 0.2 | \n",
"
\n",
" \n",
" 1 | \n",
" 4.9 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
"
\n",
" \n",
" 2 | \n",
" 4.7 | \n",
" 3.2 | \n",
" 1.3 | \n",
" 0.2 | \n",
"
\n",
" \n",
" 3 | \n",
" 4.6 | \n",
" 3.1 | \n",
" 1.5 | \n",
" 0.2 | \n",
"
\n",
" \n",
" 4 | \n",
" 5.0 | \n",
" 3.6 | \n",
" 1.4 | \n",
" 0.2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm\n",
"0 5.1 3.5 1.4 0.2\n",
"1 4.9 3.0 1.4 0.2\n",
"2 4.7 3.2 1.3 0.2\n",
"3 4.6 3.1 1.5 0.2\n",
"4 5.0 3.6 1.4 0.2"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X=iris[iris.columns[1:5]]\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"y=iris['Species']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Separamos en train y test"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"train_X,val_X,train_y,val_y = train_test_split(X,y,random_state=1)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.naive_bayes import GaussianNB"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"# Definimos\n",
"gnb = GaussianNB()\n",
"# Ajsutamos\n",
"gnb.fit(train_X,train_y)\n",
"# Predecimos\n",
"y_pred = gnb.predict(val_X)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Número de puntos mal asignados de un total de 38 puntos: 1\n"
]
}
],
"source": [
"print('Número de puntos mal asignados de un total de %d puntos: %d' % (len(val_X),(val_y!=y_pred).sum()))"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"14 Iris-setosa\n",
"98 Iris-versicolor\n",
"75 Iris-versicolor\n",
"16 Iris-setosa\n",
"131 Iris-virginica\n",
"56 Iris-versicolor\n",
"141 Iris-virginica\n",
"44 Iris-setosa\n",
"29 Iris-setosa\n",
"120 Iris-virginica\n",
"94 Iris-versicolor\n",
"5 Iris-setosa\n",
"102 Iris-virginica\n",
"51 Iris-versicolor\n",
"78 Iris-versicolor\n",
"42 Iris-setosa\n",
"92 Iris-versicolor\n",
"66 Iris-versicolor\n",
"31 Iris-setosa\n",
"35 Iris-setosa\n",
"90 Iris-versicolor\n",
"84 Iris-versicolor\n",
"77 Iris-versicolor\n",
"40 Iris-setosa\n",
"125 Iris-virginica\n",
"99 Iris-versicolor\n",
"33 Iris-setosa\n",
"19 Iris-setosa\n",
"73 Iris-versicolor\n",
"146 Iris-virginica\n",
"91 Iris-versicolor\n",
"135 Iris-virginica\n",
"69 Iris-versicolor\n",
"128 Iris-virginica\n",
"114 Iris-virginica\n",
"48 Iris-setosa\n",
"53 Iris-versicolor\n",
"28 Iris-setosa\n",
"Name: Species, dtype: object"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val_y"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Iris-setosa', 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',\n",
" 'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',\n",
" 'Iris-setosa', 'Iris-setosa', 'Iris-virginica', 'Iris-versicolor',\n",
" 'Iris-setosa', 'Iris-virginica', 'Iris-versicolor',\n",
" 'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor',\n",
" 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor',\n",
" 'Iris-versicolor', 'Iris-virginica', 'Iris-setosa',\n",
" 'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',\n",
" 'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',\n",
" 'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',\n",
" 'Iris-virginica', 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa'],\n",
" dtype='