exercise3a-checkpoint.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Notebook showing logisitic regression using multiple variables with a little bit of preprocessing\n",
    "#### by Salih MSA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Importing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Importing libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import statistics # mean, median, etc.\n",
    "\n",
    "# Data visualisation functionality\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "\n",
    "from sklearn.preprocessing import OneHotEncoder # method to split dataset into 4\n",
    "from sklearn.model_selection import train_test_split # method to split dataset into 4\n",
    "from sklearn.linear_model import LogisticRegression # linear regression algorithm\n",
    "from sklearn.metrics import mean_squared_error, mean_absolute_error # accuracy testing method"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Importing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"breast_cancer.csv\") # import dataset with custom headers, store"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data exploration & Preprocessing\n",
    "### Check for possible issues"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<bound method NDFrame._add_numeric_operations.<locals>.sum of 0      False\n",
       "1      False\n",
       "2      False\n",
       "3      False\n",
       "4      False\n",
       "       ...  \n",
       "564    False\n",
       "565    False\n",
       "566    False\n",
       "567    False\n",
       "568    False\n",
       "Length: 569, dtype: bool>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.duplicated().sum"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 569 entries, 0 to 568\n",
      "Data columns (total 33 columns):\n",
      " #   Column                   Non-Null Count  Dtype  \n",
      "---  ------                   --------------  -----  \n",
      " 0   id                       569 non-null    int64  \n",
      " 1   diagnosis                569 non-null    object \n",
      " 2   radius_mean              569 non-null    float64\n",
      " 3   texture_mean             569 non-null    float64\n",
      " 4   perimeter_mean           569 non-null    float64\n",
      " 5   area_mean                569 non-null    float64\n",
      " 6   smoothness_mean          569 non-null    float64\n",
      " 7   compactness_mean         569 non-null    float64\n",
      " 8   concavity_mean           569 non-null    float64\n",
      " 9   concave points_mean      569 non-null    float64\n",
      " 10  symmetry_mean            569 non-null    float64\n",
      " 11  fractal_dimension_mean   569 non-null    float64\n",
      " 12  radius_se                569 non-null    float64\n",
      " 13  texture_se               569 non-null    float64\n",
      " 14  perimeter_se             569 non-null    float64\n",
      " 15  area_se                  569 non-null    float64\n",
      " 16  smoothness_se            569 non-null    float64\n",
      " 17  compactness_se           569 non-null    float64\n",
      " 18  concavity_se             569 non-null    float64\n",
      " 19  concave points_se        569 non-null    float64\n",
      " 20  symmetry_se              569 non-null    float64\n",
      " 21  fractal_dimension_se     569 non-null    float64\n",
      " 22  radius_worst             569 non-null    float64\n",
      " 23  texture_worst            569 non-null    float64\n",
      " 24  perimeter_worst          569 non-null    float64\n",
      " 25  area_worst               569 non-null    float64\n",
      " 26  smoothness_worst         569 non-null    float64\n",
      " 27  compactness_worst        569 non-null    float64\n",
      " 28  concavity_worst          569 non-null    float64\n",
      " 29  concave points_worst     569 non-null    float64\n",
      " 30  symmetry_worst           569 non-null    float64\n",
      " 31  fractal_dimension_worst  569 non-null    float64\n",
      " 32  Unnamed: 32              0 non-null      float64\n",
      "dtypes: float64(31), int64(1), object(1)\n",
      "memory usage: 146.8+ KB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>diagnosis</th>\n",
       "      <th>radius_mean</th>\n",
       "      <th>texture_mean</th>\n",
       "      <th>perimeter_mean</th>\n",
       "      <th>area_mean</th>\n",
       "      <th>smoothness_mean</th>\n",
       "      <th>compactness_mean</th>\n",
       "      <th>concavity_mean</th>\n",
       "      <th>concave points_mean</th>\n",
       "      <th>...</th>\n",
       "      <th>texture_worst</th>\n",
       "      <th>perimeter_worst</th>\n",
       "      <th>area_worst</th>\n",
       "      <th>smoothness_worst</th>\n",
       "      <th>compactness_worst</th>\n",
       "      <th>concavity_worst</th>\n",
       "      <th>concave points_worst</th>\n",
       "      <th>symmetry_worst</th>\n",
       "      <th>fractal_dimension_worst</th>\n",
       "      <th>Unnamed: 32</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>842302</td>\n",
       "      <td>M</td>\n",
       "      <td>17.99</td>\n",
       "      <td>10.38</td>\n",
       "      <td>122.80</td>\n",
       "      <td>1001.0</td>\n",
       "      <td>0.11840</td>\n",
       "      <td>0.27760</td>\n",
       "      <td>0.3001</td>\n",
       "      <td>0.14710</td>\n",
       "      <td>...</td>\n",
       "      <td>17.33</td>\n",
       "      <td>184.60</td>\n",
       "      <td>2019.0</td>\n",
       "      <td>0.1622</td>\n",
       "      <td>0.6656</td>\n",
       "      <td>0.7119</td>\n",
       "      <td>0.2654</td>\n",
       "      <td>0.4601</td>\n",
       "      <td>0.11890</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>842517</td>\n",
       "      <td>M</td>\n",
       "      <td>20.57</td>\n",
       "      <td>17.77</td>\n",
       "      <td>132.90</td>\n",
       "      <td>1326.0</td>\n",
       "      <td>0.08474</td>\n",
       "      <td>0.07864</td>\n",
       "      <td>0.0869</td>\n",
       "      <td>0.07017</td>\n",
       "      <td>...</td>\n",
       "      <td>23.41</td>\n",
       "      <td>158.80</td>\n",
       "      <td>1956.0</td>\n",
       "      <td>0.1238</td>\n",
       "      <td>0.1866</td>\n",
       "      <td>0.2416</td>\n",
       "      <td>0.1860</td>\n",
       "      <td>0.2750</td>\n",
       "      <td>0.08902</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>84300903</td>\n",
       "      <td>M</td>\n",
       "      <td>19.69</td>\n",
       "      <td>21.25</td>\n",
       "      <td>130.00</td>\n",
       "      <td>1203.0</td>\n",
       "      <td>0.10960</td>\n",
       "      <td>0.15990</td>\n",
       "      <td>0.1974</td>\n",
       "      <td>0.12790</td>\n",
       "      <td>...</td>\n",
       "      <td>25.53</td>\n",
       "      <td>152.50</td>\n",
       "      <td>1709.0</td>\n",
       "      <td>0.1444</td>\n",
       "      <td>0.4245</td>\n",
       "      <td>0.4504</td>\n",
       "      <td>0.2430</td>\n",
       "      <td>0.3613</td>\n",
       "      <td>0.08758</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>84348301</td>\n",
       "      <td>M</td>\n",
       "      <td>11.42</td>\n",
       "      <td>20.38</td>\n",
       "      <td>77.58</td>\n",
       "      <td>386.1</td>\n",
       "      <td>0.14250</td>\n",
       "      <td>0.28390</td>\n",
       "      <td>0.2414</td>\n",
       "      <td>0.10520</td>\n",
       "      <td>...</td>\n",
       "      <td>26.50</td>\n",
       "      <td>98.87</td>\n",
       "      <td>567.7</td>\n",
       "      <td>0.2098</td>\n",
       "      <td>0.8663</td>\n",
       "      <td>0.6869</td>\n",
       "      <td>0.2575</td>\n",
       "      <td>0.6638</td>\n",
       "      <td>0.17300</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>84358402</td>\n",
       "      <td>M</td>\n",
       "      <td>20.29</td>\n",
       "      <td>14.34</td>\n",
       "      <td>135.10</td>\n",
       "      <td>1297.0</td>\n",
       "      <td>0.10030</td>\n",
       "      <td>0.13280</td>\n",
       "      <td>0.1980</td>\n",
       "      <td>0.10430</td>\n",
       "      <td>...</td>\n",
       "      <td>16.67</td>\n",
       "      <td>152.20</td>\n",
       "      <td>1575.0</td>\n",
       "      <td>0.1374</td>\n",
       "      <td>0.2050</td>\n",
       "      <td>0.4000</td>\n",
       "      <td>0.1625</td>\n",
       "      <td>0.2364</td>\n",
       "      <td>0.07678</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 33 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \\\n",
       "0    842302         M        17.99         10.38          122.80     1001.0   \n",
       "1    842517         M        20.57         17.77          132.90     1326.0   \n",
       "2  84300903         M        19.69         21.25          130.00     1203.0   \n",
       "3  84348301         M        11.42         20.38           77.58      386.1   \n",
       "4  84358402         M        20.29         14.34          135.10     1297.0   \n",
       "\n",
       "   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \\\n",
       "0          0.11840           0.27760          0.3001              0.14710   \n",
       "1          0.08474           0.07864          0.0869              0.07017   \n",
       "2          0.10960           0.15990          0.1974              0.12790   \n",
       "3          0.14250           0.28390          0.2414              0.10520   \n",
       "4          0.10030           0.13280          0.1980              0.10430   \n",
       "\n",
       "   ...  texture_worst  perimeter_worst  area_worst  smoothness_worst  \\\n",
       "0  ...          17.33           184.60      2019.0            0.1622   \n",
       "1  ...          23.41           158.80      1956.0            0.1238   \n",
       "2  ...          25.53           152.50      1709.0            0.1444   \n",
       "3  ...          26.50            98.87       567.7            0.2098   \n",
       "4  ...          16.67           152.20      1575.0            0.1374   \n",
       "\n",
       "   compactness_worst  concavity_worst  concave points_worst  symmetry_worst  \\\n",
       "0             0.6656           0.7119                0.2654          0.4601   \n",
       "1             0.1866           0.2416                0.1860          0.2750   \n",
       "2             0.4245           0.4504                0.2430          0.3613   \n",
       "3             0.8663           0.6869                0.2575          0.6638   \n",
       "4             0.2050           0.4000                0.1625          0.2364   \n",
       "\n",
       "   fractal_dimension_worst  Unnamed: 32  \n",
       "0                  0.11890          NaN  \n",
       "1                  0.08902          NaN  \n",
       "2                  0.08758          NaN  \n",
       "3                  0.17300          NaN  \n",
       "4                  0.07678          NaN  \n",
       "\n",
       "[5 rows x 33 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Resolving issues"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# There are issues with this dataset - 'diagnosis' (our y object) MUST be an integer\n",
    "# we can simply just create another column and fill that with the correct versions of values (ie 0, 1) & replace the intial column\n",
    "fDiagnosis = pd.get_dummies(data[\"diagnosis\"]) # use 'get_dummies' method converts categorical variable into dummy/indicator variables\n",
    "# note: this creates a column for each categorical, where 1 represents in each column whether a row had that value set or not\n",
    "data[\"diagnosis\"] = fDiagnosis.iloc[:, -1] # replace old w in new column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Another issue - column 'id' & 'Unnamed 32' are unusable - delete them\n",
    "data.drop(columns=\"id\", inplace=True) # remove old column\n",
    "data.drop(columns=\"Unnamed: 32\", inplace=True) # remove old column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>diagnosis</th>\n",
       "      <th>radius_mean</th>\n",
       "      <th>texture_mean</th>\n",
       "      <th>perimeter_mean</th>\n",
       "      <th>area_mean</th>\n",
       "      <th>smoothness_mean</th>\n",
       "      <th>compactness_mean</th>\n",
       "      <th>concavity_mean</th>\n",
       "      <th>concave points_mean</th>\n",
       "      <th>symmetry_mean</th>\n",
       "      <th>...</th>\n",
       "      <th>radius_worst</th>\n",
       "      <th>texture_worst</th>\n",
       "      <th>perimeter_worst</th>\n",
       "      <th>area_worst</th>\n",
       "      <th>smoothness_worst</th>\n",
       "      <th>compactness_worst</th>\n",
       "      <th>concavity_worst</th>\n",
       "      <th>concave points_worst</th>\n",
       "      <th>symmetry_worst</th>\n",
       "      <th>fractal_dimension_worst</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>17.99</td>\n",
       "      <td>10.38</td>\n",
       "      <td>122.80</td>\n",
       "      <td>1001.0</td>\n",
       "      <td>0.11840</td>\n",
       "      <td>0.27760</td>\n",
       "      <td>0.3001</td>\n",
       "      <td>0.14710</td>\n",
       "      <td>0.2419</td>\n",
       "      <td>...</td>\n",
       "      <td>25.38</td>\n",
       "      <td>17.33</td>\n",
       "      <td>184.60</td>\n",
       "      <td>2019.0</td>\n",
       "      <td>0.1622</td>\n",
       "      <td>0.6656</td>\n",
       "      <td>0.7119</td>\n",
       "      <td>0.2654</td>\n",
       "      <td>0.4601</td>\n",
       "      <td>0.11890</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>20.57</td>\n",
       "      <td>17.77</td>\n",
       "      <td>132.90</td>\n",
       "      <td>1326.0</td>\n",
       "      <td>0.08474</td>\n",
       "      <td>0.07864</td>\n",
       "      <td>0.0869</td>\n",
       "      <td>0.07017</td>\n",
       "      <td>0.1812</td>\n",
       "      <td>...</td>\n",
       "      <td>24.99</td>\n",
       "      <td>23.41</td>\n",
       "      <td>158.80</td>\n",
       "      <td>1956.0</td>\n",
       "      <td>0.1238</td>\n",
       "      <td>0.1866</td>\n",
       "      <td>0.2416</td>\n",
       "      <td>0.1860</td>\n",
       "      <td>0.2750</td>\n",
       "      <td>0.08902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>19.69</td>\n",
       "      <td>21.25</td>\n",
       "      <td>130.00</td>\n",
       "      <td>1203.0</td>\n",
       "      <td>0.10960</td>\n",
       "      <td>0.15990</td>\n",
       "      <td>0.1974</td>\n",
       "      <td>0.12790</td>\n",
       "      <td>0.2069</td>\n",
       "      <td>...</td>\n",
       "      <td>23.57</td>\n",
       "      <td>25.53</td>\n",
       "      <td>152.50</td>\n",
       "      <td>1709.0</td>\n",
       "      <td>0.1444</td>\n",
       "      <td>0.4245</td>\n",
       "      <td>0.4504</td>\n",
       "      <td>0.2430</td>\n",
       "      <td>0.3613</td>\n",
       "      <td>0.08758</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>11.42</td>\n",
       "      <td>20.38</td>\n",
       "      <td>77.58</td>\n",
       "      <td>386.1</td>\n",
       "      <td>0.14250</td>\n",
       "      <td>0.28390</td>\n",
       "      <td>0.2414</td>\n",
       "      <td>0.10520</td>\n",
       "      <td>0.2597</td>\n",
       "      <td>...</td>\n",
       "      <td>14.91</td>\n",
       "      <td>26.50</td>\n",
       "      <td>98.87</td>\n",
       "      <td>567.7</td>\n",
       "      <td>0.2098</td>\n",
       "      <td>0.8663</td>\n",
       "      <td>0.6869</td>\n",
       "      <td>0.2575</td>\n",
       "      <td>0.6638</td>\n",
       "      <td>0.17300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>20.29</td>\n",
       "      <td>14.34</td>\n",
       "      <td>135.10</td>\n",
       "      <td>1297.0</td>\n",
       "      <td>0.10030</td>\n",
       "      <td>0.13280</td>\n",
       "      <td>0.1980</td>\n",
       "      <td>0.10430</td>\n",
       "      <td>0.1809</td>\n",
       "      <td>...</td>\n",
       "      <td>22.54</td>\n",
       "      <td>16.67</td>\n",
       "      <td>152.20</td>\n",
       "      <td>1575.0</td>\n",
       "      <td>0.1374</td>\n",
       "      <td>0.2050</td>\n",
       "      <td>0.4000</td>\n",
       "      <td>0.1625</td>\n",
       "      <td>0.2364</td>\n",
       "      <td>0.07678</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 31 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \\\n",
       "0          1        17.99         10.38          122.80     1001.0   \n",
       "1          1        20.57         17.77          132.90     1326.0   \n",
       "2          1        19.69         21.25          130.00     1203.0   \n",
       "3          1        11.42         20.38           77.58      386.1   \n",
       "4          1        20.29         14.34          135.10     1297.0   \n",
       "\n",
       "   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \\\n",
       "0          0.11840           0.27760          0.3001              0.14710   \n",
       "1          0.08474           0.07864          0.0869              0.07017   \n",
       "2          0.10960           0.15990          0.1974              0.12790   \n",
       "3          0.14250           0.28390          0.2414              0.10520   \n",
       "4          0.10030           0.13280          0.1980              0.10430   \n",
       "\n",
       "   symmetry_mean  ...  radius_worst  texture_worst  perimeter_worst  \\\n",
       "0         0.2419  ...         25.38          17.33           184.60   \n",
       "1         0.1812  ...         24.99          23.41           158.80   \n",
       "2         0.2069  ...         23.57          25.53           152.50   \n",
       "3         0.2597  ...         14.91          26.50            98.87   \n",
       "4         0.1809  ...         22.54          16.67           152.20   \n",
       "\n",
       "   area_worst  smoothness_worst  compactness_worst  concavity_worst  \\\n",
       "0      2019.0            0.1622             0.6656           0.7119   \n",
       "1      1956.0            0.1238             0.1866           0.2416   \n",
       "2      1709.0            0.1444             0.4245           0.4504   \n",
       "3       567.7            0.2098             0.8663           0.6869   \n",
       "4      1575.0            0.1374             0.2050           0.4000   \n",
       "\n",
       "   concave points_worst  symmetry_worst  fractal_dimension_worst  \n",
       "0                0.2654          0.4601                  0.11890  \n",
       "1                0.1860          0.2750                  0.08902  \n",
       "2                0.2430          0.3613                  0.08758  \n",
       "3                0.2575          0.6638                  0.17300  \n",
       "4                0.1625          0.2364                  0.07678  \n",
       "\n",
       "[5 rows x 31 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Now view fixed data\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Learning itself\n",
    "### Split sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x:       texture_mean  perimeter_mean  area_mean  smoothness_mean  \\\n",
      "0           10.38          122.80     1001.0          0.11840   \n",
      "1           17.77          132.90     1326.0          0.08474   \n",
      "2           21.25          130.00     1203.0          0.10960   \n",
      "3           20.38           77.58      386.1          0.14250   \n",
      "4           14.34          135.10     1297.0          0.10030   \n",
      "..            ...             ...        ...              ...   \n",
      "564         22.39          142.00     1479.0          0.11100   \n",
      "565         28.25          131.20     1261.0          0.09780   \n",
      "566         28.08          108.30      858.1          0.08455   \n",
      "567         29.33          140.10     1265.0          0.11780   \n",
      "568         24.54           47.92      181.0          0.05263   \n",
      "\n",
      "     compactness_mean  concavity_mean  concave points_mean  symmetry_mean  \\\n",
      "0             0.27760         0.30010              0.14710         0.2419   \n",
      "1             0.07864         0.08690              0.07017         0.1812   \n",
      "2             0.15990         0.19740              0.12790         0.2069   \n",
      "3             0.28390         0.24140              0.10520         0.2597   \n",
      "4             0.13280         0.19800              0.10430         0.1809   \n",
      "..                ...             ...                  ...            ...   \n",
      "564           0.11590         0.24390              0.13890         0.1726   \n",
      "565           0.10340         0.14400              0.09791         0.1752   \n",
      "566           0.10230         0.09251              0.05302         0.1590   \n",
      "567           0.27700         0.35140              0.15200         0.2397   \n",
      "568           0.04362         0.00000              0.00000         0.1587   \n",
      "\n",
      "     fractal_dimension_mean  radius_se  ...  radius_worst  texture_worst  \\\n",
      "0                   0.07871     1.0950  ...        25.380          17.33   \n",
      "1                   0.05667     0.5435  ...        24.990          23.41   \n",
      "2                   0.05999     0.7456  ...        23.570          25.53   \n",
      "3                   0.09744     0.4956  ...        14.910          26.50   \n",
      "4                   0.05883     0.7572  ...        22.540          16.67   \n",
      "..                      ...        ...  ...           ...            ...   \n",
      "564                 0.05623     1.1760  ...        25.450          26.40   \n",
      "565                 0.05533     0.7655  ...        23.690          38.25   \n",
      "566                 0.05648     0.4564  ...        18.980          34.12   \n",
      "567                 0.07016     0.7260  ...        25.740          39.42   \n",
      "568                 0.05884     0.3857  ...         9.456          30.37   \n",
      "\n",
      "     perimeter_worst  area_worst  smoothness_worst  compactness_worst  \\\n",
      "0             184.60      2019.0           0.16220            0.66560   \n",
      "1             158.80      1956.0           0.12380            0.18660   \n",
      "2             152.50      1709.0           0.14440            0.42450   \n",
      "3              98.87       567.7           0.20980            0.86630   \n",
      "4             152.20      1575.0           0.13740            0.20500   \n",
      "..               ...         ...               ...                ...   \n",
      "564           166.10      2027.0           0.14100            0.21130   \n",
      "565           155.00      1731.0           0.11660            0.19220   \n",
      "566           126.70      1124.0           0.11390            0.30940   \n",
      "567           184.60      1821.0           0.16500            0.86810   \n",
      "568            59.16       268.6           0.08996            0.06444   \n",
      "\n",
      "     concavity_worst  concave points_worst  symmetry_worst  \\\n",
      "0             0.7119                0.2654          0.4601   \n",
      "1             0.2416                0.1860          0.2750   \n",
      "2             0.4504                0.2430          0.3613   \n",
      "3             0.6869                0.2575          0.6638   \n",
      "4             0.4000                0.1625          0.2364   \n",
      "..               ...                   ...             ...   \n",
      "564           0.4107                0.2216          0.2060   \n",
      "565           0.3215                0.1628          0.2572   \n",
      "566           0.3403                0.1418          0.2218   \n",
      "567           0.9387                0.2650          0.4087   \n",
      "568           0.0000                0.0000          0.2871   \n",
      "\n",
      "     fractal_dimension_worst  \n",
      "0                    0.11890  \n",
      "1                    0.08902  \n",
      "2                    0.08758  \n",
      "3                    0.17300  \n",
      "4                    0.07678  \n",
      "..                       ...  \n",
      "564                  0.07115  \n",
      "565                  0.06637  \n",
      "566                  0.07820  \n",
      "567                  0.12400  \n",
      "568                  0.07039  \n",
      "\n",
      "[569 rows x 29 columns]\n",
      "y:  0      1\n",
      "1      1\n",
      "2      1\n",
      "3      1\n",
      "4      1\n",
      "      ..\n",
      "564    1\n",
      "565    1\n",
      "566    1\n",
      "567    1\n",
      "568    0\n",
      "Name: diagnosis, Length: 569, dtype: uint8\n"
     ]
    }
   ],
   "source": [
    "x = data.iloc[:, 2:].values # values we want to classify - we only want\n",
    "print(\"x: \", data.iloc[:, 2:])\n",
    "y = data.iloc[:, 0].values # acceptances for each row (ie either benign (0) or malignant (1))\n",
    "print(\"y: \", data.iloc[:, 0])\n",
    "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=0) # split dataset into train, test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = LogisticRegression(max_iter=20000)\n",
    "model.fit(x_train, y_train)\n",
    "predictions = model.predict(x_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test accuracy using testing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Predicted values:</th>\n",
       "      <th>Actual values</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Predicted values:  Actual values\n",
       "0                  1              1\n",
       "1                  0              0\n",
       "2                  0              0\n",
       "3                  0              0\n",
       "4                  0              0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_test_pred = model.predict(x_test) # based on our model, give it values to try to predict with\n",
    "pred_vs_actual = pd.DataFrame({\"Predicted values:\": y_test_pred, \"Actual values\": y_test})\n",
    "pred_vs_actual.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean squared error: 0.04895104895104895\n",
      "Mean absolute error: 1.8251748251748252\n",
      "Model accuracy: 0.951\n"
     ]
    }
   ],
   "source": [
    "print(\"Mean squared error:\", mean_squared_error(y_test, y_test_pred))\n",
    "print(\"Mean absolute error:\", mean_absolute_error(y_test, y_test_pred))\n",
    "accuracy = model.score(x_test, y_test) # or simply called score method to use the models inherent predictions vs a dataset / subset we give it \n",
    "print(\"Model accuracy: {:.3f}\".format(accuracy)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}