{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "iris=pd.read_csv('Iris.csv')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
015.13.51.40.2Iris-setosa
124.93.01.40.2Iris-setosa
234.73.21.30.2Iris-setosa
344.63.11.50.2Iris-setosa
455.03.61.40.2Iris-setosa
\n", "
" ], "text/plain": [ " Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n", "0 1 5.1 3.5 1.4 0.2 Iris-setosa\n", "1 2 4.9 3.0 1.4 0.2 Iris-setosa\n", "2 3 4.7 3.2 1.3 0.2 Iris-setosa\n", "3 4 4.6 3.1 1.5 0.2 Iris-setosa\n", "4 5 5.0 3.6 1.4 0.2 Iris-setosa" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris['Species'].unique()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Split the data into one sub-frame per species label.\n", "iris_setosa = iris.loc[iris['Species'] == 'Iris-setosa']\n", "iris_versicolor = iris.loc[iris['Species'] == 'Iris-versicolor']\n", "iris_virginica = iris.loc[iris['Species'] == 'Iris-virginica']" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
50517.03.24.71.4Iris-versicolor
51526.43.24.51.5Iris-versicolor
52536.93.14.91.5Iris-versicolor
53545.52.34.01.3Iris-versicolor
54556.52.84.61.5Iris-versicolor
\n", "
" ], "text/plain": [ " Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm \\\n", "50 51 7.0 3.2 4.7 1.4 \n", "51 52 6.4 3.2 4.5 1.5 \n", "52 53 6.9 3.1 4.9 1.5 \n", "53 54 5.5 2.3 4.0 1.3 \n", "54 55 6.5 2.8 4.6 1.5 \n", "\n", " Species \n", "50 Iris-versicolor \n", "51 Iris-versicolor \n", "52 Iris-versicolor \n", "53 Iris-versicolor \n", "54 Iris-versicolor " ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris_versicolor.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Me calculo los $ \\#C_k /N$:" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(0.3333333333333333, 0.3333333333333333, 0.3333333333333333)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Class priors: number of rows of each species divided by the total number of rows.\n", "N_setosa = iris_setosa.shape[0] / iris.shape[0]\n", "N_versicolor = iris_versicolor.shape[0] / iris.shape[0]\n", "N_virginica = iris_virginica.shape[0] / iris.shape[0]\n", "\n", "N_setosa, N_versicolor, N_virginica" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCm
count50.0000050.0000050.00000050.00000050.00000
mean25.500005.006003.4180001.4640000.24400
std14.577380.352490.3810240.1735110.10721
min1.000004.300002.3000001.0000000.10000
25%13.250004.800003.1250001.4000000.20000
50%25.500005.000003.4000001.5000000.20000
75%37.750005.200003.6750001.5750000.30000
max50.000005.800004.4000001.9000000.60000
\n", "
" ], "text/plain": [ " Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm\n", "count 50.00000 50.00000 50.000000 50.000000 50.00000\n", "mean 25.50000 5.00600 3.418000 1.464000 0.24400\n", "std 14.57738 0.35249 0.381024 0.173511 0.10721\n", "min 1.00000 4.30000 2.300000 1.000000 0.10000\n", "25% 13.25000 4.80000 3.125000 1.400000 0.20000\n", "50% 25.50000 5.00000 3.400000 1.500000 0.20000\n", "75% 37.75000 5.20000 3.675000 1.575000 0.30000\n", "max 50.00000 5.80000 4.400000 1.900000 0.60000" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris_setosa.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Me defino la función `gauss(x,mu,sigma)` que da\n", "$$\n", "\\mathcal{N}(x|\\mu,\\sigma)=\\frac{1}{\\sqrt{2\\pi\\sigma^2}} e^{-\\frac{(x-\\mu)^2}{2\\sigma^2}}\n", "$$\n", "donde $\\sigma$ es la desviación estándar (no la varianza)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def gauss(x, mu, sigma):\n", "    \"\"\"Evaluate the univariate normal density N(x | mu, sigma).\n", "\n", "    Parameters\n", "    ----------\n", "    x : float or array-like\n", "        Point(s) at which the density is evaluated.\n", "    mu : float\n", "        Mean of the distribution.\n", "    sigma : float\n", "        Standard deviation of the distribution (NOT the variance).\n", "\n", "    Returns\n", "    -------\n", "    float or ndarray\n", "        exp(-(x - mu)**2 / (2 * sigma**2)) / sqrt(2 * pi * sigma**2)\n", "    \"\"\"\n", "    # Normalise by sqrt(2*pi*sigma**2); dividing by sqrt(2*pi*sigma) does not integrate to 1.\n", "    return np.exp(-(x - mu)**2 / (2 * sigma**2)) / np.sqrt(2 * np.pi * sigma**2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Mi nueva instancia $x$ será simplemente la primera fila del dataset" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "x = iris.iloc[0]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Id 1\n", "SepalLengthCm 5.1\n", "SepalWidthCm 3.5\n", "PetalLengthCm 1.4\n", "PetalWidthCm 0.2\n", "Species Iris-setosa\n", "Name: 0, dtype: object" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.12424897959183666" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris_setosa['SepalLengthCm'].var()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Me calculo lo que quiero maximizar:\n", 
"$$\n", " \\# C_k / N \\cdot \\prod_i \\mathcal{N}(x_i|\\mu_k^i,\\sigma_k^i)\n", " $$" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.columns[1:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Prior times the product of per-feature densities for class setosa.\n", "# gauss() expects the standard deviation, hence .std() rather than .var().\n", "P_setosa = N_setosa\n", "for i in iris.columns[1:5]:\n", "    P_setosa = P_setosa * gauss(x[i], iris_setosa[i].mean(), iris_setosa[i].std())\n", "\n", "P_setosa" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Same score for class virginica (standard deviation, not variance).\n", "P_virginica = N_virginica\n", "for i in iris.columns[1:5]:\n", "    P_virginica = P_virginica * gauss(x[i], iris_virginica[i].mean(), iris_virginica[i].std())\n", "\n", "P_virginica" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$$ P_{\\text{virginica}} \\ll P_{\\text{setosa}} $$" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Same score for class versicolor (standard deviation, not variance).\n", "P_versicolor = N_versicolor\n", "for i in iris.columns[1:5]:\n", "    P_versicolor = P_versicolor * gauss(x[i], iris_versicolor[i].mean(), iris_versicolor[i].std())\n", "\n", "P_versicolor" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Predigo que es de tipo Iris-setosa" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Ejercicio\n", "Automatizar todo esto." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Hagámoslo ahora automáticamente con sklearn" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCm
05.13.51.40.2
14.93.01.40.2
24.73.21.30.2
34.63.11.50.2
45.03.61.40.2
\n", "
" ], "text/plain": [ " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm\n", "0 5.1 3.5 1.4 0.2\n", "1 4.9 3.0 1.4 0.2\n", "2 4.7 3.2 1.3 0.2\n", "3 4.6 3.1 1.5 0.2\n", "4 5.0 3.6 1.4 0.2" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Feature matrix: the four measurement columns (positions 1 to 4; column 0 is Id).\n", "X = iris.loc[:, iris.columns[1:5]]\n", "X.head()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "# Target labels.\n", "y = iris.loc[:, 'Species']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Separamos en train y test" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "# Fixed random_state so the split is the same on every run.\n", "train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "from sklearn.naive_bayes import GaussianNB" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "# Define the model and fit it on the training split (fit returns the estimator itself).\n", "gnb = GaussianNB().fit(train_X, train_y)\n", "# Predict the species of the validation rows.\n", "y_pred = gnb.predict(val_X)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Número de puntos mal asignados de un total de 38 puntos: 1\n" ] } ], "source": [ "# Report how many validation rows were misclassified.\n", "print('Número de puntos mal asignados de un total de %d puntos: %d'\n", "      % (val_X.shape[0], (val_y != y_pred).sum()))" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "14 Iris-setosa\n", "98 Iris-versicolor\n", "75 Iris-versicolor\n", "16 Iris-setosa\n", "131 Iris-virginica\n", "56 Iris-versicolor\n", "141 Iris-virginica\n", "44 Iris-setosa\n", "29 Iris-setosa\n", "120 Iris-virginica\n", "94 Iris-versicolor\n", "5 Iris-setosa\n", "102 Iris-virginica\n", "51 Iris-versicolor\n", "78 Iris-versicolor\n", "42 
Iris-setosa\n", "92 Iris-versicolor\n", "66 Iris-versicolor\n", "31 Iris-setosa\n", "35 Iris-setosa\n", "90 Iris-versicolor\n", "84 Iris-versicolor\n", "77 Iris-versicolor\n", "40 Iris-setosa\n", "125 Iris-virginica\n", "99 Iris-versicolor\n", "33 Iris-setosa\n", "19 Iris-setosa\n", "73 Iris-versicolor\n", "146 Iris-virginica\n", "91 Iris-versicolor\n", "135 Iris-virginica\n", "69 Iris-versicolor\n", "128 Iris-virginica\n", "114 Iris-virginica\n", "48 Iris-setosa\n", "53 Iris-versicolor\n", "28 Iris-setosa\n", "Name: Species, dtype: object" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val_y" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Iris-setosa', 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',\n", " 'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',\n", " 'Iris-setosa', 'Iris-setosa', 'Iris-virginica', 'Iris-versicolor',\n", " 'Iris-setosa', 'Iris-virginica', 'Iris-versicolor',\n", " 'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor',\n", " 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor',\n", " 'Iris-versicolor', 'Iris-virginica', 'Iris-setosa',\n", " 'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',\n", " 'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',\n", " 'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',\n", " 'Iris-virginica', 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa'],\n", " dtype='