exercise3b-checkpoint.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Notebook showing more logisitic regression using multiple variables involving lots of preprocessing\n",
    "#### by Salih MSA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Importing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Importing libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import statistics # mean, median, etc.\n",
    "\n",
    "# Data visualisation functionality\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "\n",
    "from sklearn.preprocessing import OneHotEncoder # method to preprocess data (specifically converting columns->categorical datas)\n",
    "from sklearn.model_selection import train_test_split # method to split dataset into 4\n",
    "from sklearn.linear_model import LogisticRegression # linear regression algorithm\n",
    "from sklearn.metrics import mean_squared_error, mean_absolute_error # accuracy testing method"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Importing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"titanic.csv\") # import dataset with custom headers, store"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data exploration & Preprocessing\n",
    "### Check for possible issues"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<bound method NDFrame._add_numeric_operations.<locals>.sum of      PassengerId  Survived  Pclass   Name    Sex    Age  SibSp  Parch  Ticket  \\\n",
       "0          False     False   False  False  False  False  False  False   False   \n",
       "1          False     False   False  False  False  False  False  False   False   \n",
       "2          False     False   False  False  False  False  False  False   False   \n",
       "3          False     False   False  False  False  False  False  False   False   \n",
       "4          False     False   False  False  False  False  False  False   False   \n",
       "..           ...       ...     ...    ...    ...    ...    ...    ...     ...   \n",
       "886        False     False   False  False  False  False  False  False   False   \n",
       "887        False     False   False  False  False  False  False  False   False   \n",
       "888        False     False   False  False  False   True  False  False   False   \n",
       "889        False     False   False  False  False  False  False  False   False   \n",
       "890        False     False   False  False  False  False  False  False   False   \n",
       "\n",
       "      Fare  Cabin  Embarked  \n",
       "0    False   True     False  \n",
       "1    False  False     False  \n",
       "2    False   True     False  \n",
       "3    False  False     False  \n",
       "4    False   True     False  \n",
       "..     ...    ...       ...  \n",
       "886  False   True     False  \n",
       "887  False  False     False  \n",
       "888  False   True     False  \n",
       "889  False  False     False  \n",
       "890  False   True     False  \n",
       "\n",
       "[891 rows x 12 columns]>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.isnull().sum"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<bound method NDFrame._add_numeric_operations.<locals>.sum of 0      False\n",
       "1      False\n",
       "2      False\n",
       "3      False\n",
       "4      False\n",
       "       ...  \n",
       "886    False\n",
       "887    False\n",
       "888    False\n",
       "889    False\n",
       "890    False\n",
       "Length: 891, dtype: bool>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.duplicated().sum"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 891 entries, 0 to 890\n",
      "Data columns (total 12 columns):\n",
      " #   Column       Non-Null Count  Dtype  \n",
      "---  ------       --------------  -----  \n",
      " 0   PassengerId  891 non-null    int64  \n",
      " 1   Survived     891 non-null    int64  \n",
      " 2   Pclass       891 non-null    int64  \n",
      " 3   Name         891 non-null    object \n",
      " 4   Sex          891 non-null    object \n",
      " 5   Age          714 non-null    float64\n",
      " 6   SibSp        891 non-null    int64  \n",
      " 7   Parch        891 non-null    int64  \n",
      " 8   Ticket       891 non-null    object \n",
      " 9   Fare         891 non-null    float64\n",
      " 10  Cabin        204 non-null    object \n",
      " 11  Embarked     889 non-null    object \n",
      "dtypes: float64(2), int64(5), object(5)\n",
      "memory usage: 83.7+ KB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>886</th>\n",
       "      <td>887</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>Montvila, Rev. Juozas</td>\n",
       "      <td>male</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>211536</td>\n",
       "      <td>13.0000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>887</th>\n",
       "      <td>888</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Graham, Miss. Margaret Edith</td>\n",
       "      <td>female</td>\n",
       "      <td>19.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>112053</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>B42</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>889</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>W./C. 6607</td>\n",
       "      <td>23.4500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>890</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Behr, Mr. Karl Howell</td>\n",
       "      <td>male</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>111369</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>C148</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>891</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Dooley, Mr. Patrick</td>\n",
       "      <td>male</td>\n",
       "      <td>32.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>370376</td>\n",
       "      <td>7.7500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>891 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass  \\\n",
       "0              1         0       3   \n",
       "1              2         1       1   \n",
       "2              3         1       3   \n",
       "3              4         1       1   \n",
       "4              5         0       3   \n",
       "..           ...       ...     ...   \n",
       "886          887         0       2   \n",
       "887          888         1       1   \n",
       "888          889         0       3   \n",
       "889          890         1       1   \n",
       "890          891         0       3   \n",
       "\n",
       "                                                  Name     Sex   Age  SibSp  \\\n",
       "0                              Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                               Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                             Allen, Mr. William Henry    male  35.0      0   \n",
       "..                                                 ...     ...   ...    ...   \n",
       "886                              Montvila, Rev. Juozas    male  27.0      0   \n",
       "887                       Graham, Miss. Margaret Edith  female  19.0      0   \n",
       "888           Johnston, Miss. Catherine Helen \"Carrie\"  female   NaN      1   \n",
       "889                              Behr, Mr. Karl Howell    male  26.0      0   \n",
       "890                                Dooley, Mr. Patrick    male  32.0      0   \n",
       "\n",
       "     Parch            Ticket     Fare Cabin Embarked  \n",
       "0        0         A/5 21171   7.2500   NaN        S  \n",
       "1        0          PC 17599  71.2833   C85        C  \n",
       "2        0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3        0            113803  53.1000  C123        S  \n",
       "4        0            373450   8.0500   NaN        S  \n",
       "..     ...               ...      ...   ...      ...  \n",
       "886      0            211536  13.0000   NaN        S  \n",
       "887      0            112053  30.0000   B42        S  \n",
       "888      2        W./C. 6607  23.4500   NaN        S  \n",
       "889      0            111369  30.0000  C148        C  \n",
       "890      0            370376   7.7500   NaN        Q  \n",
       "\n",
       "[891 rows x 12 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Resolving issues"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# turn any non numerical (be that string or conti. values) categories -> numerical ones\n",
    "# we can simply just create another column and fill that with the correct versions of values (ie 0, 1) & replace the intial column\n",
    "fSex = pd.get_dummies(data[\"Sex\"]) # use 'get_dummies' method converts categorical variable into dummy/indicator variables\n",
    "data[\"Sex\"] = fSex.iloc[:, -1] # replace old w in new column\n",
    "fEmbarked = pd.get_dummies(data[\"Embarked\"]) # use 'get_dummies' method converts categorical variable into dummy/indicator variables\n",
    "data[\"Embarked\"] = fEmbarked.iloc[:, -1] # replace old w in new column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Another issue - many columns are irrelevant - delete them\n",
    "data.drop(columns=\"PassengerId\", inplace=True) # remove old column\n",
    "data.drop(columns=\"Name\", inplace=True) # remove old column\n",
    "data.drop(columns=\"Ticket\", inplace=True) # remove old column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Some columns contain too many NaN's (IEEE NotANumber) to make it non-viable to delete the rows afflicted - delete them too\n",
    "data.drop(columns=\"Cabin\", inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now drop the rows containing NaN's\n",
    "data = data.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>885</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>39.0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>29.1250</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>886</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>13.0000</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>887</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>32.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.7500</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>714 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked\n",
       "0           0       3    1  22.0      1      0   7.2500         1\n",
       "1           1       1    0  38.0      1      0  71.2833         0\n",
       "2           1       3    0  26.0      0      0   7.9250         1\n",
       "3           1       1    0  35.0      1      0  53.1000         1\n",
       "4           0       3    1  35.0      0      0   8.0500         1\n",
       "..        ...     ...  ...   ...    ...    ...      ...       ...\n",
       "885         0       3    0  39.0      0      5  29.1250         0\n",
       "886         0       2    1  27.0      0      0  13.0000         1\n",
       "887         1       1    0  19.0      0      0  30.0000         1\n",
       "889         1       1    1  26.0      0      0  30.0000         0\n",
       "890         0       3    1  32.0      0      0   7.7500         0\n",
       "\n",
       "[714 rows x 8 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Now view fixed data\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Learning itself\n",
    "### Split sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x: [[ 1.     22.      1.      0.      7.25    1.    ]\n",
      " [ 0.     38.      1.      0.     71.2833  0.    ]\n",
      " [ 0.     26.      0.      0.      7.925   1.    ]\n",
      " ...\n",
      " [ 0.     19.      0.      0.     30.      1.    ]\n",
      " [ 1.     26.      0.      0.     30.      0.    ]\n",
      " [ 1.     32.      0.      0.      7.75    0.    ]]\n",
      "y: 0      0\n",
      "1      1\n",
      "2      1\n",
      "3      1\n",
      "4      0\n",
      "      ..\n",
      "885    0\n",
      "886    0\n",
      "887    1\n",
      "889    1\n",
      "890    0\n",
      "Name: Survived, Length: 714, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "x = data.iloc[:, 2:].values # values we want to classify - we only want\n",
    "print(\"x:\", x)\n",
    "y = data.iloc[:, 0].values # acceptances for each row (ie either benign (0) or malignant (1))\n",
    "print(\"y:\", data.iloc[:, 0])\n",
    "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=0) # split dataset into train, test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = LogisticRegression(max_iter=20000)\n",
    "model.fit(x_train, y_train)\n",
    "predictions = model.predict(x_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test accuracy using testing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Predicted values:</th>\n",
       "      <th>Actual values</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>174</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>175</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>176</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>177</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>178</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>179 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Predicted values:  Actual values\n",
       "0                    1              0\n",
       "1                    1              0\n",
       "2                    1              1\n",
       "3                    0              0\n",
       "4                    0              1\n",
       "..                 ...            ...\n",
       "174                  0              0\n",
       "175                  0              0\n",
       "176                  1              1\n",
       "177                  1              1\n",
       "178                  0              0\n",
       "\n",
       "[179 rows x 2 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_test_pred = model.predict(x_test) # based on our model, give it values to try to predict with\n",
    "pred_vs_actual = pd.DataFrame({\"Predicted values:\": y_test_pred, \"Actual values\": y_test})\n",
    "pred_vs_actual"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean squared error: 0.22905027932960895\n",
      "Mean absolute error: 0.22905027932960895\n",
      "Model accuracy: 0.771\n"
     ]
    }
   ],
   "source": [
    "print(\"Mean squared error:\", mean_squared_error(y_test, y_test_pred))\n",
    "print(\"Mean absolute error:\", mean_absolute_error(y_test, y_test_pred))\n",
    "accuracy = model.score(x_test, y_test) # or simply called score method to use the models inherent predictions vs a dataset / subset we give it \n",
    "print(\"Model accuracy: {:.3f}\".format(accuracy)) "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Notebook showing more logisitic regression using multiple variables involving lots of preprocessing\n",
	"#### by Salih MSA"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Importing"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Importing libraries"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"\n",
	"import statistics # mean, median, etc.\n",
	"\n",
	"# Data visualisation functionality\n",
	"import matplotlib.pyplot as plt\n",
	"%matplotlib inline\n",
	"import seaborn as sns\n",
	"\n",
	"from sklearn.preprocessing import OneHotEncoder # method to preprocess data (specifically converting columns->categorical datas)\n",
	"from sklearn.model_selection import train_test_split # method to split dataset into 4\n",
	"from sklearn.linear_model import LogisticRegression # linear regression algorithm\n",
	"from sklearn.metrics import mean_squared_error, mean_absolute_error # accuracy testing method"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Importing data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"data = pd.read_csv(\"titanic.csv\") # import dataset with custom headers, store"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Data exploration & Preprocessing\n",
	"### Check for possible issues"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"<bound method NDFrame._add_numeric_operations.<locals>.sum of PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket \\\n",
	"0 False False False False False False False False False \n",
	"1 False False False False False False False False False \n",
	"2 False False False False False False False False False \n",
	"3 False False False False False False False False False \n",
	"4 False False False False False False False False False \n",
	".. ... ... ... ... ... ... ... ... ... \n",
	"886 False False False False False False False False False \n",
	"887 False False False False False False False False False \n",
	"888 False False False False False True False False False \n",
	"889 False False False False False False False False False \n",
	"890 False False False False False False False False False \n",
	"\n",
	" Fare Cabin Embarked \n",
	"0 False True False \n",
	"1 False False False \n",
	"2 False True False \n",
	"3 False False False \n",
	"4 False True False \n",
	".. ... ... ... \n",
	"886 False True False \n",
	"887 False False False \n",
	"888 False True False \n",
	"889 False False False \n",
	"890 False True False \n",
	"\n",
	"[891 rows x 12 columns]>"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"data.isnull().sum"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"<bound method NDFrame._add_numeric_operations.<locals>.sum of 0 False\n",
	"1 False\n",
	"2 False\n",
	"3 False\n",
	"4 False\n",
	" ... \n",
	"886 False\n",
	"887 False\n",
	"888 False\n",
	"889 False\n",
	"890 False\n",
	"Length: 891, dtype: bool>"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"data.duplicated().sum"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<class 'pandas.core.frame.DataFrame'>\n",
	"RangeIndex: 891 entries, 0 to 890\n",
	"Data columns (total 12 columns):\n",
	" # Column Non-Null Count Dtype \n",
	"--- ------ -------------- ----- \n",
	" 0 PassengerId 891 non-null int64 \n",
	" 1 Survived 891 non-null int64 \n",
	" 2 Pclass 891 non-null int64 \n",
	" 3 Name 891 non-null object \n",
	" 4 Sex 891 non-null object \n",
	" 5 Age 714 non-null float64\n",
	" 6 SibSp 891 non-null int64 \n",
	" 7 Parch 891 non-null int64 \n",
	" 8 Ticket 891 non-null object \n",
	" 9 Fare 891 non-null float64\n",
	" 10 Cabin 204 non-null object \n",
	" 11 Embarked 889 non-null object \n",
	"dtypes: float64(2), int64(5), object(5)\n",
	"memory usage: 83.7+ KB\n"
	]
	}
	],
	"source": [
	"data.info()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>PassengerId</th>\n",
	" <th>Survived</th>\n",
	" <th>Pclass</th>\n",
	" <th>Name</th>\n",
	" <th>Sex</th>\n",
	" <th>Age</th>\n",
	" <th>SibSp</th>\n",
	" <th>Parch</th>\n",
	" <th>Ticket</th>\n",
	" <th>Fare</th>\n",
	" <th>Cabin</th>\n",
	" <th>Embarked</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>3</td>\n",
	" <td>Braund, Mr. Owen Harris</td>\n",
	" <td>male</td>\n",
	" <td>22.0</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>A/5 21171</td>\n",
	" <td>7.2500</td>\n",
	" <td>NaN</td>\n",
	" <td>S</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>2</td>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
	" <td>female</td>\n",
	" <td>38.0</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>PC 17599</td>\n",
	" <td>71.2833</td>\n",
	" <td>C85</td>\n",
	" <td>C</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>3</td>\n",
	" <td>1</td>\n",
	" <td>3</td>\n",
	" <td>Heikkinen, Miss. Laina</td>\n",
	" <td>female</td>\n",
	" <td>26.0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>STON/O2. 3101282</td>\n",
	" <td>7.9250</td>\n",
	" <td>NaN</td>\n",
	" <td>S</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>4</td>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
	" <td>female</td>\n",
	" <td>35.0</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>113803</td>\n",
	" <td>53.1000</td>\n",
	" <td>C123</td>\n",
	" <td>S</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>5</td>\n",
	" <td>0</td>\n",
	" <td>3</td>\n",
	" <td>Allen, Mr. William Henry</td>\n",
	" <td>male</td>\n",
	" <td>35.0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>373450</td>\n",
	" <td>8.0500</td>\n",
	" <td>NaN</td>\n",
	" <td>S</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>886</th>\n",
	" <td>887</td>\n",
	" <td>0</td>\n",
	" <td>2</td>\n",
	" <td>Montvila, Rev. Juozas</td>\n",
	" <td>male</td>\n",
	" <td>27.0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>211536</td>\n",
	" <td>13.0000</td>\n",
	" <td>NaN</td>\n",
	" <td>S</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>887</th>\n",
	" <td>888</td>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>Graham, Miss. Margaret Edith</td>\n",
	" <td>female</td>\n",
	" <td>19.0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>112053</td>\n",
	" <td>30.0000</td>\n",
	" <td>B42</td>\n",
	" <td>S</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>888</th>\n",
	" <td>889</td>\n",
	" <td>0</td>\n",
	" <td>3</td>\n",
	" <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
	" <td>female</td>\n",
	" <td>NaN</td>\n",
	" <td>1</td>\n",
	" <td>2</td>\n",
	" <td>W./C. 6607</td>\n",
	" <td>23.4500</td>\n",
	" <td>NaN</td>\n",
	" <td>S</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>889</th>\n",
	" <td>890</td>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>Behr, Mr. Karl Howell</td>\n",
	" <td>male</td>\n",
	" <td>26.0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>111369</td>\n",
	" <td>30.0000</td>\n",
	" <td>C148</td>\n",
	" <td>C</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>890</th>\n",
	" <td>891</td>\n",
	" <td>0</td>\n",
	" <td>3</td>\n",
	" <td>Dooley, Mr. Patrick</td>\n",
	" <td>male</td>\n",
	" <td>32.0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>370376</td>\n",
	" <td>7.7500</td>\n",
	" <td>NaN</td>\n",
	" <td>Q</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>891 rows × 12 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" PassengerId Survived Pclass \\\n",
	"0 1 0 3 \n",
	"1 2 1 1 \n",
	"2 3 1 3 \n",
	"3 4 1 1 \n",
	"4 5 0 3 \n",
	".. ... ... ... \n",
	"886 887 0 2 \n",
	"887 888 1 1 \n",
	"888 889 0 3 \n",
	"889 890 1 1 \n",
	"890 891 0 3 \n",
	"\n",
	" Name Sex Age SibSp \\\n",
	"0 Braund, Mr. Owen Harris male 22.0 1 \n",
	"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
	"2 Heikkinen, Miss. Laina female 26.0 0 \n",
	"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
	"4 Allen, Mr. William Henry male 35.0 0 \n",
	".. ... ... ... ... \n",
	"886 Montvila, Rev. Juozas male 27.0 0 \n",
	"887 Graham, Miss. Margaret Edith female 19.0 0 \n",
	"888 Johnston, Miss. Catherine Helen \"Carrie\" female NaN 1 \n",
	"889 Behr, Mr. Karl Howell male 26.0 0 \n",
	"890 Dooley, Mr. Patrick male 32.0 0 \n",
	"\n",
	" Parch Ticket Fare Cabin Embarked \n",
	"0 0 A/5 21171 7.2500 NaN S \n",
	"1 0 PC 17599 71.2833 C85 C \n",
	"2 0 STON/O2. 3101282 7.9250 NaN S \n",
	"3 0 113803 53.1000 C123 S \n",
	"4 0 373450 8.0500 NaN S \n",
	".. ... ... ... ... ... \n",
	"886 0 211536 13.0000 NaN S \n",
	"887 0 112053 30.0000 B42 S \n",
	"888 2 W./C. 6607 23.4500 NaN S \n",
	"889 0 111369 30.0000 C148 C \n",
	"890 0 370376 7.7500 NaN Q \n",
	"\n",
	"[891 rows x 12 columns]"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"data"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Resolving issues"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"# turn any non numerical (be that string or conti. values) categories -> numerical ones\n",
	"# we can simply just create another column and fill that with the correct versions of values (ie 0, 1) & replace the intial column\n",
	"fSex = pd.get_dummies(data[\"Sex\"]) # use 'get_dummies' method converts categorical variable into dummy/indicator variables\n",
	"data[\"Sex\"] = fSex.iloc[:, -1] # replace old w in new column\n",
	"fEmbarked = pd.get_dummies(data[\"Embarked\"]) # use 'get_dummies' method converts categorical variable into dummy/indicator variables\n",
	"data[\"Embarked\"] = fEmbarked.iloc[:, -1] # replace old w in new column"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Another issue - many columns are irrelevant - delete them\n",
	"data.drop(columns=\"PassengerId\", inplace=True) # remove old column\n",
	"data.drop(columns=\"Name\", inplace=True) # remove old column\n",
	"data.drop(columns=\"Ticket\", inplace=True) # remove old column"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Some columns contain too many NaN's (IEEE NotANumber) to make it non-viable to delete the rows afflicted - delete them too\n",
	"data.drop(columns=\"Cabin\", inplace=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"# now drop the rows containing NaN's\n",
	"data = data.dropna()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Survived</th>\n",
	" <th>Pclass</th>\n",
	" <th>Sex</th>\n",
	" <th>Age</th>\n",
	" <th>SibSp</th>\n",
	" <th>Parch</th>\n",
	" <th>Fare</th>\n",
	" <th>Embarked</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>0</td>\n",
	" <td>3</td>\n",
	" <td>1</td>\n",
	" <td>22.0</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>7.2500</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>38.0</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>71.2833</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>1</td>\n",
	" <td>3</td>\n",
	" <td>0</td>\n",
	" <td>26.0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>7.9250</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>35.0</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>53.1000</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>0</td>\n",
	" <td>3</td>\n",
	" <td>1</td>\n",
	" <td>35.0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>8.0500</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>885</th>\n",
	" <td>0</td>\n",
	" <td>3</td>\n",
	" <td>0</td>\n",
	" <td>39.0</td>\n",
	" <td>0</td>\n",
	" <td>5</td>\n",
	" <td>29.1250</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>886</th>\n",
	" <td>0</td>\n",
	" <td>2</td>\n",
	" <td>1</td>\n",
	" <td>27.0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>13.0000</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>887</th>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>19.0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>30.0000</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>889</th>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>26.0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>30.0000</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>890</th>\n",
	" <td>0</td>\n",
	" <td>3</td>\n",
	" <td>1</td>\n",
	" <td>32.0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>7.7500</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>714 rows × 8 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" Survived Pclass Sex Age SibSp Parch Fare Embarked\n",
	"0 0 3 1 22.0 1 0 7.2500 1\n",
	"1 1 1 0 38.0 1 0 71.2833 0\n",
	"2 1 3 0 26.0 0 0 7.9250 1\n",
	"3 1 1 0 35.0 1 0 53.1000 1\n",
	"4 0 3 1 35.0 0 0 8.0500 1\n",
	".. ... ... ... ... ... ... ... ...\n",
	"885 0 3 0 39.0 0 5 29.1250 0\n",
	"886 0 2 1 27.0 0 0 13.0000 1\n",
	"887 1 1 0 19.0 0 0 30.0000 1\n",
	"889 1 1 1 26.0 0 0 30.0000 0\n",
	"890 0 3 1 32.0 0 0 7.7500 0\n",
	"\n",
	"[714 rows x 8 columns]"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Now view fixed data\n",
	"data"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Learning itself\n",
	"### Split sets"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"x: [[ 1. 22. 1. 0. 7.25 1. ]\n",
	" [ 0. 38. 1. 0. 71.2833 0. ]\n",
	" [ 0. 26. 0. 0. 7.925 1. ]\n",
	" ...\n",
	" [ 0. 19. 0. 0. 30. 1. ]\n",
	" [ 1. 26. 0. 0. 30. 0. ]\n",
	" [ 1. 32. 0. 0. 7.75 0. ]]\n",
	"y: 0 0\n",
	"1 1\n",
	"2 1\n",
	"3 1\n",
	"4 0\n",
	" ..\n",
	"885 0\n",
	"886 0\n",
	"887 1\n",
	"889 1\n",
	"890 0\n",
	"Name: Survived, Length: 714, dtype: int64\n"
	]
	}
	],
	"source": [
	"x = data.iloc[:, 2:].values # values we want to classify - we only want\n",
	"print(\"x:\", x)\n",
	"y = data.iloc[:, 0].values # acceptances for each row (ie either benign (0) or malignant (1))\n",
	"print(\"y:\", data.iloc[:, 0])\n",
	"x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=0) # split dataset into train, test"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Train model"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [],
	"source": [
	"model = LogisticRegression(max_iter=20000)\n",
	"model.fit(x_train, y_train)\n",
	"predictions = model.predict(x_test)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Test accuracy using testing values"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Predicted values:</th>\n",
	" <th>Actual values</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>0</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>174</th>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>175</th>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>176</th>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>177</th>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>178</th>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>179 rows × 2 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" Predicted values: Actual values\n",
	"0 1 0\n",
	"1 1 0\n",
	"2 1 1\n",
	"3 0 0\n",
	"4 0 1\n",
	".. ... ...\n",
	"174 0 0\n",
	"175 0 0\n",
	"176 1 1\n",
	"177 1 1\n",
	"178 0 0\n",
	"\n",
	"[179 rows x 2 columns]"
	]
	},
	"execution_count": 16,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"y_test_pred = model.predict(x_test) # based on our model, give it values to try to predict with\n",
	"pred_vs_actual = pd.DataFrame({\"Predicted values:\": y_test_pred, \"Actual values\": y_test})\n",
	"pred_vs_actual"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Mean squared error: 0.22905027932960895\n",
	"Mean absolute error: 0.22905027932960895\n",
	"Model accuracy: 0.771\n"
	]
	}
	],
	"source": [
	"print(\"Mean squared error:\", mean_squared_error(y_test, y_test_pred))\n",
	"print(\"Mean absolute error:\", mean_absolute_error(y_test, y_test_pred))\n",
	"accuracy = model.score(x_test, y_test) # or simply called score method to use the models inherent predictions vs a dataset / subset we give it \n",
	"print(\"Model accuracy: {:.3f}\".format(accuracy)) "
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}