In [1]:
import pandas as pd
import numpy as np

We are going to work with the Titanic dataset: https://www.kaggle.com/c/titanic/overview

In [2]:
titanic = pd.read_csv('titanic/train.csv')
In [3]:
titanic.head()
Out[3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

Basic information about the columns:

In [4]:
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

We select two columns to study odds ratios and proportions:

In [5]:
dt = titanic[['Survived','Sex']]
dt.head()
Out[5]:
Survived Sex
0 0 male
1 1 female
2 1 female
3 1 female
4 0 male

Let's look at the percentage of women and men:

In [7]:
dt['Sex'].value_counts()
Out[7]:
male      577
female    314
Name: Sex, dtype: int64
In [8]:
100*dt['Sex'].value_counts()/len(dt['Sex'])
Out[8]:
male      64.758698
female    35.241302
Name: Sex, dtype: float64

We can make a bar plot:

In [9]:
import seaborn as sns
In [10]:
sns.countplot(dt['Sex'])
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f16a5a91278>
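
In recent seaborn releases the first positional argument of countplot is data rather than the column to count, so it is safer to pass the column by keyword (a small sketch, assuming a reasonably recent seaborn version):

    # same count plot, with explicit keyword arguments
    sns.countplot(x='Sex', data=dt)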

We can build a contingency table like the one above:

In [11]:
pd.crosstab(index=dt['Survived'], columns=dt['Sex'], margins=True)  # margins adds the row and column totals
Out[11]:
Sex       female  male  All
Survived
0             81   468  549
1            233   109  342
All          314   577  891

We can build the same table with the proportions/probabilities of each class:

In [12]:
pd.crosstab(index=dt['Survived'],columns=dt['Sex'], margins=True).apply(lambda r: r*100/len(dt))
Out[12]:
Sex          female       male         All
Survived
0          9.090909  52.525253   61.616162
1         26.150393  12.233446   38.383838
All       35.241302  64.758698  100.000000

The same, but relative to each row:

In [14]:
pd.crosstab(index=dt['Survived'],columns=dt['Sex']).apply(lambda r: r*100/r.sum(), axis=1)
Out[14]:
Sex          female       male
Survived
0         14.754098  85.245902
1         68.128655  31.871345
In [15]:
# Of those who survived, what fraction are men?
109/342
Out[15]:
0.31871345029239767

Relative to each column:

In [16]:
pd.crosstab(index=dt['Survived'],columns=dt['Sex']).apply(lambda r: r*100/r.sum(), axis=0)
Out[16]:
Sex          female       male
Survived
0         25.796178  81.109185
1         74.203822  18.890815
In [17]:
# Of the men, what fraction survived?
109/577
Out[17]:
0.18890814558058924
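
pandas can also produce these normalized tables directly through the normalize argument of crosstab; a minimal sketch of the equivalent calls (multiply by 100 for percentages):

    pd.crosstab(dt['Survived'], dt['Sex'], normalize='all')      # fraction of the grand total
    pd.crosstab(dt['Survived'], dt['Sex'], normalize='index')    # relative to each row
    pd.crosstab(dt['Survived'], dt['Sex'], normalize='columns')  # relative to each column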

We can make another bar plot:

In [18]:
pd.crosstab(index=dt['Survived'],columns=dt['Sex']).apply(lambda r: r*100/r.sum(), axis=0).plot(kind='bar')
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f16a5741748>

Let's compute the odds and the proportions:
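
As a reminder, if $p$ is the probability of surviving within a group, the odds of that group and the odds ratio between two groups (here men vs. women) are

$$\text{odds}=\frac{p}{1-p},\qquad \text{OR}=\frac{\text{odds}_{\text{men}}}{\text{odds}_{\text{women}}}$$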

In [22]:
p_mujer_vive=233/314
p_mujer_vive
Out[22]:
0.7420382165605095
In [23]:
p_mujer_muere=1-p_mujer_vive
In [24]:
p_hombre_vive=109/577
p_hombre_muere=1-p_hombre_vive
In [25]:
# Odds for women
odds_mujer=p_mujer_vive/p_mujer_muere
odds_mujer
Out[25]:
2.876543209876542
In [26]:
# Odds for men
odds_hombre=p_hombre_vive/p_hombre_muere
odds_hombre
Out[26]:
0.23290598290598288
In [55]:
# Odds ratio (men relative to women)
odds_ratio=odds_hombre/odds_mujer
odds_ratio
Out[55]:
0.08096731594585674
In [28]:
odds_mujer/odds_hombre  # the inverse: odds ratio of women relative to men
Out[28]:
12.350662589194696

There are functions that compute the odds ratio for us (we will not go into much detail):

In [29]:
import scipy.stats as stats
table=pd.crosstab(index=dt['Survived'],columns=dt['Sex'])
oddsratio, pvalue =stats.fisher_exact(table)

oddsratio
Out[29]:
0.08096731594585672

The model:

In [30]:
dt.head()
Out[30]:
Survived Sex
0 0 male
1 1 female
2 1 female
3 1 female
4 0 male

To encode the Sex variable as ones and zeros we use get_dummies:

In [32]:
dt=pd.get_dummies(dt)
dt.head()
Out[32]:
Survived Sex_female Sex_male
0 0 0 1
1 1 1 0
2 1 1 0
3 1 1 0
4 0 0 1

Since one of the two columns is redundant, we drop it:

In [33]:
dt=dt.drop('Sex_female',axis=1)
dt.head()
Out[33]:
Survived Sex_male
0 0 1
1 1 0
2 1 0
3 1 0
4 0 1
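
The two previous cells can be combined: get_dummies can drop the first level of each categorical variable for us (a small sketch of the equivalent call):

    # drop_first=True keeps only Sex_male, which is what we just did by hand
    dt = pd.get_dummies(titanic[['Survived', 'Sex']], drop_first=True)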

Let's fit the logistic regression:

In [34]:
from sklearn.linear_model import LogisticRegression
In [35]:
help(LogisticRegression)
Help on class LogisticRegression in module sklearn.linear_model.logistic:

class LogisticRegression(sklearn.base.BaseEstimator, sklearn.linear_model.base.LinearClassifierMixin, sklearn.linear_model.base.SparseCoefMixin)
 |  LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', verbose=0, warm_start=False, n_jobs=None)
 |  
 |  Logistic Regression (aka logit, MaxEnt) classifier.
 |  
 |  In the multiclass case, the training algorithm uses the one-vs-rest (OvR)
 |  scheme if the 'multi_class' option is set to 'ovr', and uses the cross-
 |  entropy loss if the 'multi_class' option is set to 'multinomial'.
 |  (Currently the 'multinomial' option is supported only by the 'lbfgs',
 |  'sag' and 'newton-cg' solvers.)
 |  
 |  This class implements regularized logistic regression using the
 |  'liblinear' library, 'newton-cg', 'sag' and 'lbfgs' solvers. It can handle
 |  both dense and sparse input. Use C-ordered arrays or CSR matrices
 |  containing 64-bit floats for optimal performance; any other input format
 |  will be converted (and copied).
 |  
 |  The 'newton-cg', 'sag', and 'lbfgs' solvers support only L2 regularization
 |  with primal formulation. The 'liblinear' solver supports both L1 and L2
 |  regularization, with a dual formulation only for the L2 penalty.
 |  
 |  Read more in the :ref:`User Guide <logistic_regression>`.
 |  
 |  Parameters
 |  ----------
 |  penalty : str, 'l1' or 'l2', default: 'l2'
 |      Used to specify the norm used in the penalization. The 'newton-cg',
 |      'sag' and 'lbfgs' solvers support only l2 penalties.
 |  
 |      .. versionadded:: 0.19
 |         l1 penalty with SAGA solver (allowing 'multinomial' + L1)
 |  
 |  dual : bool, default: False
 |      Dual or primal formulation. Dual formulation is only implemented for
 |      l2 penalty with liblinear solver. Prefer dual=False when
 |      n_samples > n_features.
 |  
 |  tol : float, default: 1e-4
 |      Tolerance for stopping criteria.
 |  
 |  C : float, default: 1.0
 |      Inverse of regularization strength; must be a positive float.
 |      Like in support vector machines, smaller values specify stronger
 |      regularization.
 |  
 |  fit_intercept : bool, default: True
 |      Specifies if a constant (a.k.a. bias or intercept) should be
 |      added to the decision function.
 |  
 |  intercept_scaling : float, default 1.
 |      Useful only when the solver 'liblinear' is used
 |      and self.fit_intercept is set to True. In this case, x becomes
 |      [x, self.intercept_scaling],
 |      i.e. a "synthetic" feature with constant value equal to
 |      intercept_scaling is appended to the instance vector.
 |      The intercept becomes ``intercept_scaling * synthetic_feature_weight``.
 |  
 |      Note! the synthetic feature weight is subject to l1/l2 regularization
 |      as all other features.
 |      To lessen the effect of regularization on synthetic feature weight
 |      (and therefore on the intercept) intercept_scaling has to be increased.
 |  
 |  class_weight : dict or 'balanced', default: None
 |      Weights associated with classes in the form ``{class_label: weight}``.
 |      If not given, all classes are supposed to have weight one.
 |  
 |      The "balanced" mode uses the values of y to automatically adjust
 |      weights inversely proportional to class frequencies in the input data
 |      as ``n_samples / (n_classes * np.bincount(y))``.
 |  
 |      Note that these weights will be multiplied with sample_weight (passed
 |      through the fit method) if sample_weight is specified.
 |  
 |      .. versionadded:: 0.17
 |         *class_weight='balanced'*
 |  
 |  random_state : int, RandomState instance or None, optional, default: None
 |      The seed of the pseudo random number generator to use when shuffling
 |      the data.  If int, random_state is the seed used by the random number
 |      generator; If RandomState instance, random_state is the random number
 |      generator; If None, the random number generator is the RandomState
 |      instance used by `np.random`. Used when ``solver`` == 'sag' or
 |      'liblinear'.
 |  
 |  solver : str, {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'},              default: 'liblinear'.
 |  
 |      Algorithm to use in the optimization problem.
 |  
 |      - For small datasets, 'liblinear' is a good choice, whereas 'sag' and
 |        'saga' are faster for large ones.
 |      - For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs'
 |        handle multinomial loss; 'liblinear' is limited to one-versus-rest
 |        schemes.
 |      - 'newton-cg', 'lbfgs' and 'sag' only handle L2 penalty, whereas
 |        'liblinear' and 'saga' handle L1 penalty.
 |  
 |      Note that 'sag' and 'saga' fast convergence is only guaranteed on
 |      features with approximately the same scale. You can
 |      preprocess the data with a scaler from sklearn.preprocessing.
 |  
 |      .. versionadded:: 0.17
 |         Stochastic Average Gradient descent solver.
 |      .. versionadded:: 0.19
 |         SAGA solver.
 |      .. versionchanged:: 0.20
 |          Default will change from 'liblinear' to 'lbfgs' in 0.22.
 |  
 |  max_iter : int, default: 100
 |      Useful only for the newton-cg, sag and lbfgs solvers.
 |      Maximum number of iterations taken for the solvers to converge.
 |  
 |  multi_class : str, {'ovr', 'multinomial', 'auto'}, default: 'ovr'
 |      If the option chosen is 'ovr', then a binary problem is fit for each
 |      label. For 'multinomial' the loss minimised is the multinomial loss fit
 |      across the entire probability distribution, *even when the data is
 |      binary*. 'multinomial' is unavailable when solver='liblinear'.
 |      'auto' selects 'ovr' if the data is binary, or if solver='liblinear',
 |      and otherwise selects 'multinomial'.
 |  
 |      .. versionadded:: 0.18
 |         Stochastic Average Gradient descent solver for 'multinomial' case.
 |      .. versionchanged:: 0.20
 |          Default will change from 'ovr' to 'auto' in 0.22.
 |  
 |  verbose : int, default: 0
 |      For the liblinear and lbfgs solvers set verbose to any positive
 |      number for verbosity.
 |  
 |  warm_start : bool, default: False
 |      When set to True, reuse the solution of the previous call to fit as
 |      initialization, otherwise, just erase the previous solution.
 |      Useless for liblinear solver. See :term:`the Glossary <warm_start>`.
 |  
 |      .. versionadded:: 0.17
 |         *warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers.
 |  
 |  n_jobs : int or None, optional (default=None)
 |      Number of CPU cores used when parallelizing over classes if
 |      multi_class='ovr'". This parameter is ignored when the ``solver`` is
 |      set to 'liblinear' regardless of whether 'multi_class' is specified or
 |      not. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
 |      context. ``-1`` means using all processors.
 |      See :term:`Glossary <n_jobs>` for more details.
 |  
 |  Attributes
 |  ----------
 |  
 |  classes_ : array, shape (n_classes, )
 |      A list of class labels known to the classifier.
 |  
 |  coef_ : array, shape (1, n_features) or (n_classes, n_features)
 |      Coefficient of the features in the decision function.
 |  
 |      `coef_` is of shape (1, n_features) when the given problem is binary.
 |      In particular, when `multi_class='multinomial'`, `coef_` corresponds
 |      to outcome 1 (True) and `-coef_` corresponds to outcome 0 (False).
 |  
 |  intercept_ : array, shape (1,) or (n_classes,)
 |      Intercept (a.k.a. bias) added to the decision function.
 |  
 |      If `fit_intercept` is set to False, the intercept is set to zero.
 |      `intercept_` is of shape (1,) when the given problem is binary.
 |      In particular, when `multi_class='multinomial'`, `intercept_`
 |      corresponds to outcome 1 (True) and `-intercept_` corresponds to
 |      outcome 0 (False).
 |  
 |  n_iter_ : array, shape (n_classes,) or (1, )
 |      Actual number of iterations for all classes. If binary or multinomial,
 |      it returns only 1 element. For liblinear solver, only the maximum
 |      number of iteration across all classes is given.
 |  
 |      .. versionchanged:: 0.20
 |  
 |          In SciPy <= 1.0.0 the number of lbfgs iterations may exceed
 |          ``max_iter``. ``n_iter_`` will now report at most ``max_iter``.
 |  
 |  Examples
 |  --------
 |  >>> from sklearn.datasets import load_iris
 |  >>> from sklearn.linear_model import LogisticRegression
 |  >>> X, y = load_iris(return_X_y=True)
 |  >>> clf = LogisticRegression(random_state=0, solver='lbfgs',
 |  ...                          multi_class='multinomial').fit(X, y)
 |  >>> clf.predict(X[:2, :])
 |  array([0, 0])
 |  >>> clf.predict_proba(X[:2, :]) # doctest: +ELLIPSIS
 |  array([[9.8...e-01, 1.8...e-02, 1.4...e-08],
 |         [9.7...e-01, 2.8...e-02, ...e-08]])
 |  >>> clf.score(X, y)
 |  0.97...
 |  
 |  See also
 |  --------
 |  SGDClassifier : incrementally trained logistic regression (when given
 |      the parameter ``loss="log"``).
 |  LogisticRegressionCV : Logistic regression with built-in cross validation
 |  
 |  Notes
 |  -----
 |  The underlying C implementation uses a random number generator to
 |  select features when fitting the model. It is thus not uncommon,
 |  to have slightly different results for the same input data. If
 |  that happens, try with a smaller tol parameter.
 |  
 |  Predict output may not match that of standalone liblinear in certain
 |  cases. See :ref:`differences from liblinear <liblinear_differences>`
 |  in the narrative documentation.
 |  
 |  References
 |  ----------
 |  
 |  LIBLINEAR -- A Library for Large Linear Classification
 |      https://www.csie.ntu.edu.tw/~cjlin/liblinear/
 |  
 |  SAG -- Mark Schmidt, Nicolas Le Roux, and Francis Bach
 |      Minimizing Finite Sums with the Stochastic Average Gradient
 |      https://hal.inria.fr/hal-00860051/document
 |  
 |  SAGA -- Defazio, A., Bach F. & Lacoste-Julien S. (2014).
 |      SAGA: A Fast Incremental Gradient Method With Support
 |      for Non-Strongly Convex Composite Objectives
 |      https://arxiv.org/abs/1407.0202
 |  
 |  Hsiang-Fu Yu, Fang-Lan Huang, Chih-Jen Lin (2011). Dual coordinate descent
 |      methods for logistic regression and maximum entropy models.
 |      Machine Learning 85(1-2):41-75.
 |      https://www.csie.ntu.edu.tw/~cjlin/papers/maxent_dual.pdf
 |  
 |  Method resolution order:
 |      LogisticRegression
 |      sklearn.base.BaseEstimator
 |      sklearn.linear_model.base.LinearClassifierMixin
 |      sklearn.base.ClassifierMixin
 |      sklearn.linear_model.base.SparseCoefMixin
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', verbose=0, warm_start=False, n_jobs=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  fit(self, X, y, sample_weight=None)
 |      Fit the model according to the given training data.
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix}, shape (n_samples, n_features)
 |          Training vector, where n_samples is the number of samples and
 |          n_features is the number of features.
 |      
 |      y : array-like, shape (n_samples,)
 |          Target vector relative to X.
 |      
 |      sample_weight : array-like, shape (n_samples,) optional
 |          Array of weights that are assigned to individual samples.
 |          If not provided, then each sample is given unit weight.
 |      
 |          .. versionadded:: 0.17
 |             *sample_weight* support to LogisticRegression.
 |      
 |      Returns
 |      -------
 |      self : object
 |  
 |  predict_log_proba(self, X)
 |      Log of probability estimates.
 |      
 |      The returned estimates for all classes are ordered by the
 |      label of classes.
 |      
 |      Parameters
 |      ----------
 |      X : array-like, shape = [n_samples, n_features]
 |      
 |      Returns
 |      -------
 |      T : array-like, shape = [n_samples, n_classes]
 |          Returns the log-probability of the sample for each class in the
 |          model, where classes are ordered as they are in ``self.classes_``.
 |  
 |  predict_proba(self, X)
 |      Probability estimates.
 |      
 |      The returned estimates for all classes are ordered by the
 |      label of classes.
 |      
 |      For a multi_class problem, if multi_class is set to be "multinomial"
 |      the softmax function is used to find the predicted probability of
 |      each class.
 |      Else use a one-vs-rest approach, i.e calculate the probability
 |      of each class assuming it to be positive using the logistic function.
 |      and normalize these values across all the classes.
 |      
 |      Parameters
 |      ----------
 |      X : array-like, shape = [n_samples, n_features]
 |      
 |      Returns
 |      -------
 |      T : array-like, shape = [n_samples, n_classes]
 |          Returns the probability of the sample for each class in the model,
 |          where classes are ordered as they are in ``self.classes_``.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.base.BaseEstimator:
 |  
 |  __getstate__(self)
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  __setstate__(self, state)
 |  
 |  get_params(self, deep=True)
 |      Get parameters for this estimator.
 |      
 |      Parameters
 |      ----------
 |      deep : boolean, optional
 |          If True, will return the parameters for this estimator and
 |          contained subobjects that are estimators.
 |      
 |      Returns
 |      -------
 |      params : mapping of string to any
 |          Parameter names mapped to their values.
 |  
 |  set_params(self, **params)
 |      Set the parameters of this estimator.
 |      
 |      The method works on simple estimators as well as on nested objects
 |      (such as pipelines). The latter have parameters of the form
 |      ``<component>__<parameter>`` so that it's possible to update each
 |      component of a nested object.
 |      
 |      Returns
 |      -------
 |      self
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from sklearn.base.BaseEstimator:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.linear_model.base.LinearClassifierMixin:
 |  
 |  decision_function(self, X)
 |      Predict confidence scores for samples.
 |      
 |      The confidence score for a sample is the signed distance of that
 |      sample to the hyperplane.
 |      
 |      Parameters
 |      ----------
 |      X : array_like or sparse matrix, shape (n_samples, n_features)
 |          Samples.
 |      
 |      Returns
 |      -------
 |      array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
 |          Confidence scores per (sample, class) combination. In the binary
 |          case, confidence score for self.classes_[1] where >0 means this
 |          class would be predicted.
 |  
 |  predict(self, X)
 |      Predict class labels for samples in X.
 |      
 |      Parameters
 |      ----------
 |      X : array_like or sparse matrix, shape (n_samples, n_features)
 |          Samples.
 |      
 |      Returns
 |      -------
 |      C : array, shape [n_samples]
 |          Predicted class label per sample.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.base.ClassifierMixin:
 |  
 |  score(self, X, y, sample_weight=None)
 |      Returns the mean accuracy on the given test data and labels.
 |      
 |      In multi-label classification, this is the subset accuracy
 |      which is a harsh metric since you require for each sample that
 |      each label set be correctly predicted.
 |      
 |      Parameters
 |      ----------
 |      X : array-like, shape = (n_samples, n_features)
 |          Test samples.
 |      
 |      y : array-like, shape = (n_samples) or (n_samples, n_outputs)
 |          True labels for X.
 |      
 |      sample_weight : array-like, shape = [n_samples], optional
 |          Sample weights.
 |      
 |      Returns
 |      -------
 |      score : float
 |          Mean accuracy of self.predict(X) wrt. y.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.linear_model.base.SparseCoefMixin:
 |  
 |  densify(self)
 |      Convert coefficient matrix to dense array format.
 |      
 |      Converts the ``coef_`` member (back) to a numpy.ndarray. This is the
 |      default format of ``coef_`` and is required for fitting, so calling
 |      this method is only required on models that have previously been
 |      sparsified; otherwise, it is a no-op.
 |      
 |      Returns
 |      -------
 |      self : estimator
 |  
 |  sparsify(self)
 |      Convert coefficient matrix to sparse format.
 |      
 |      Converts the ``coef_`` member to a scipy.sparse matrix, which for
 |      L1-regularized models can be much more memory- and storage-efficient
 |      than the usual numpy.ndarray representation.
 |      
 |      The ``intercept_`` member is not converted.
 |      
 |      Notes
 |      -----
 |      For non-sparse models, i.e. when there are not many zeros in ``coef_``,
 |      this may actually *increase* memory usage, so use this method with
 |      care. A rule of thumb is that the number of zero elements, which can
 |      be computed with ``(coef_ == 0).sum()``, must be more than 50% for this
 |      to provide significant benefits.
 |      
 |      After calling this method, further fitting with the partial_fit
 |      method (if any) will not work until you call densify.
 |      
 |      Returns
 |      -------
 |      self : estimator

In [36]:
# Define the model
logreg = LogisticRegression(random_state=0, solver='lbfgs')  # the solver can be chosen explicitly
In [37]:
# model parameters
logreg.get_params()
Out[37]:
{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 0,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}
In [46]:
# Define X and y
X = dt.drop('Survived', axis=1)  # drop() keeps X as a DataFrame rather than a Series
y = dt['Survived']
In [47]:
# Fit the model
logreg = logreg.fit(X,y)

Model coefficients: $\beta$ = coef_, $\alpha$ = intercept_.

$$p=\frac{1}{1+e^{-(\alpha+\beta x)}}$$
In [48]:
print('alpha: ', logreg.intercept_)
print('beta: ', logreg.coef_)
alpha:  [1.01628767]
beta:  [[-2.44597988]]
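
We can plug these values back into the formula above to recover the predicted survival probabilities for women ($x=0$) and men ($x=1$). A quick sketch; the results are close to the empirical proportions 233/314 and 109/577, but not identical because of the default L2 regularization (C=1.0):

    alpha = logreg.intercept_[0]
    beta = logreg.coef_[0, 0]
    p_female = 1 / (1 + np.exp(-alpha))             # Sex_male = 0
    p_male = 1 / (1 + np.exp(-(alpha + beta)))      # Sex_male = 1
    print(p_female, p_male)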

Odds for women: $e^\alpha$ (the intercept corresponds to Sex_male = 0).

In [49]:
np.exp(logreg.intercept_)
Out[49]:
array([2.76291884])
In [50]:
odds_mujer
Out[50]:
2.876543209876542

Odds for men: $e^{\alpha+\beta}$

In [51]:
np.exp(logreg.intercept_+logreg.coef_)
Out[51]:
array([[0.23938259]])
In [52]:
odds_hombre
Out[52]:
0.23290598290598288

Odds ratio: $e^\beta$

In [53]:
np.exp(logreg.coef_)
Out[53]:
array([[0.0866412]])
In [56]:
odds_ratio
Out[56]:
0.08096731594585674
In [57]:
# Predict
y_pred = logreg.predict(X)

Let's build the confusion matrix:

In [58]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_pred,y)
Out[58]:
array([[468, 109],
       [ 81, 233]])
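
Note that sklearn's convention is confusion_matrix(y_true, y_pred); with the arguments in that order the rows correspond to the true class. A labelled version, as a small sketch:

    cm = confusion_matrix(y, y_pred)
    pd.DataFrame(cm, index=['true 0', 'true 1'], columns=['pred 0', 'pred 1'])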

We can measure the model's performance with its accuracy. It can be obtained with .score, which evaluates the fitted model on a dataset and returns its accuracy, or with accuracy_score, which compares a vector of true labels against a vector of predicted ones:

In [59]:
logreg.score(X,y)
Out[59]:
0.7867564534231201
In [60]:
from sklearn.metrics import accuracy_score
accuracy_score(y,y_pred)
Out[60]:
0.7867564534231201

Let's work through another example with more variables:

In [61]:
dt=titanic[['Survived','Sex','Age','Pclass']]
dt.head()
Out[61]:
Survived Sex Age Pclass
0 0 male 22.0 3
1 1 female 38.0 1
2 1 female 26.0 3
3 1 female 35.0 1
4 0 male 35.0 3
In [62]:
dt.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
Survived    891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
Pclass      891 non-null int64
dtypes: float64(1), int64(2), object(1)
memory usage: 27.9+ KB
In [64]:
# Convert Pclass to string, since it is a categorical variable
dt['Pclass'] = dt['Pclass'].astype(str)
dt.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
Survived    891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
Pclass      891 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 27.9+ KB
In [65]:
# Create dummy variables for the categorical columns
dt=pd.get_dummies(dt)
dt.head()
Out[65]:
Survived Age Sex_female Sex_male Pclass_1 Pclass_2 Pclass_3
0 0 22.0 0 1 0 0 1
1 1 38.0 1 0 1 0 0
2 1 26.0 1 0 0 0 1
3 1 35.0 1 0 1 0 0
4 0 35.0 0 1 0 0 1
In [66]:
# Drop the redundant dummy columns
dt=dt.drop('Pclass_3',axis=1)
dt=dt.drop('Sex_female',axis=1)
dt.head()
Out[66]:
Survived Age Sex_male Pclass_1 Pclass_2
0 0 22.0 1 0 0
1 1 38.0 0 1 0
2 1 26.0 0 0 0
3 1 35.0 0 1 0
4 0 35.0 1 0 0

Let's deal with the NaN values:

In [67]:
np.sum(dt.isnull())
Out[67]:
Survived      0
Age         177
Sex_male      0
Pclass_1      0
Pclass_2      0
dtype: int64
In [70]:
# Drop the rows with NaN
dt = dt.dropna()
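
Instead of dropping those 177 rows we could also impute the missing ages, for example with the median (a sketch of the alternative, not what this notebook does):

    # keep all 891 rows by filling missing Age values with the median age
    dt['Age'] = dt['Age'].fillna(dt['Age'].median())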

Let's build the model:

In [72]:
# Select X and y
X = dt.drop('Survived', axis=1)
y = dt['Survived']
In [86]:
# Define:
logreg = LogisticRegression(random_state=0, solver='lbfgs')

# Fit:
logreg = logreg.fit(X, y)

# Predict:
y_pred = logreg.predict(X)
In [75]:
y_pred
Out[75]:
array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0])

The target variable has been predicted as ones and zeros, but we may want to see the predictions as probabilities. (predict basically fixes a threshold of 0.5 and returns 1 if the probability is greater than the threshold and 0 otherwise.)

In [77]:
probs=logreg.predict_proba(X)
probs
Out[77]:
array([[0.88081218, 0.11918782],
       [0.10117928, 0.89882072],
       [0.43713259, 0.56286741],
       ...,
       [0.05569741, 0.94430259],
       [0.44933092, 0.55066908],
       [0.91216125, 0.08783875]])
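
A quick sanity check of the 0.5 threshold mentioned above (a small sketch):

    # thresholding P(Survived = 1) at 0.5 reproduces predict()
    y_from_probs = (probs[:, 1] > 0.5).astype(int)
    (y_from_probs == y_pred).all()   # expected: True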

We can compute the accuracy:

In [81]:
logreg.score(X,y)
Out[81]:
0.7899159663865546

Coefficients:

In [83]:
logreg.coef_, logreg.intercept_
Out[83]:
(array([[-0.03401702, -2.38901944,  2.33958631,  1.12651064]]),
 array([1.13725005]))

There are several coefficients ($\beta_1, \beta_2, \beta_3, \beta_4$), one per variable, plus the intercept $\alpha$.
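
To see which coefficient belongs to which variable we can pair them with the column names, and exponentiate them to read them as odds ratios with the other variables held fixed (a small sketch):

    coefs = pd.Series(logreg.coef_[0], index=X.columns)
    print(coefs)
    print(np.exp(coefs))   # odds ratios per variable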

Finally, let's plot the ROC curve:

In [84]:
from sklearn.metrics import roc_curve, auc
In [88]:
fpr, tpr, threshold = roc_curve(y,probs[:,1])
roc_auc = auc(fpr,tpr)
In [89]:
import matplotlib.pyplot as plt
In [90]:
t = np.arange(0, 5, 0.2)  # defined but not used in the plot below
In [91]:
plt.title('Receiver Operating Characteristic')
plt.plot(fpr,tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1],[0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
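
The AUC can also be obtained directly with roc_auc_score (a small sketch); it should match the value computed from fpr and tpr above:

    from sklearn.metrics import roc_auc_score
    roc_auc_score(y, probs[:, 1])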
In [ ]: