Skip to content
Permalink
8328ef52ee
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
911 lines (911 sloc) 32.7 KB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Notebook showing logisitic regression using multiple variables with a little bit of preprocessing\n",
"#### by Salih MSA"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Importing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Importing libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import statistics # mean, median, etc.\n",
"\n",
"# Data visualisation functionality\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import seaborn as sns\n",
"\n",
"from sklearn.preprocessing import OneHotEncoder # method to split dataset into 4\n",
"from sklearn.model_selection import train_test_split # method to split dataset into 4\n",
"from sklearn.linear_model import LogisticRegression # linear regression algorithm\n",
"from sklearn.metrics import mean_squared_error, mean_absolute_error # accuracy testing method"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Importing data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(\"breast_cancer.csv\") # import dataset with custom headers, store"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data exploration & Preprocessing\n",
"### Check for possible issues"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<bound method NDFrame._add_numeric_operations.<locals>.sum of 0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
" ... \n",
"564 False\n",
"565 False\n",
"566 False\n",
"567 False\n",
"568 False\n",
"Length: 569, dtype: bool>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.duplicated().sum"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 569 entries, 0 to 568\n",
"Data columns (total 33 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 569 non-null int64 \n",
" 1 diagnosis 569 non-null object \n",
" 2 radius_mean 569 non-null float64\n",
" 3 texture_mean 569 non-null float64\n",
" 4 perimeter_mean 569 non-null float64\n",
" 5 area_mean 569 non-null float64\n",
" 6 smoothness_mean 569 non-null float64\n",
" 7 compactness_mean 569 non-null float64\n",
" 8 concavity_mean 569 non-null float64\n",
" 9 concave points_mean 569 non-null float64\n",
" 10 symmetry_mean 569 non-null float64\n",
" 11 fractal_dimension_mean 569 non-null float64\n",
" 12 radius_se 569 non-null float64\n",
" 13 texture_se 569 non-null float64\n",
" 14 perimeter_se 569 non-null float64\n",
" 15 area_se 569 non-null float64\n",
" 16 smoothness_se 569 non-null float64\n",
" 17 compactness_se 569 non-null float64\n",
" 18 concavity_se 569 non-null float64\n",
" 19 concave points_se 569 non-null float64\n",
" 20 symmetry_se 569 non-null float64\n",
" 21 fractal_dimension_se 569 non-null float64\n",
" 22 radius_worst 569 non-null float64\n",
" 23 texture_worst 569 non-null float64\n",
" 24 perimeter_worst 569 non-null float64\n",
" 25 area_worst 569 non-null float64\n",
" 26 smoothness_worst 569 non-null float64\n",
" 27 compactness_worst 569 non-null float64\n",
" 28 concavity_worst 569 non-null float64\n",
" 29 concave points_worst 569 non-null float64\n",
" 30 symmetry_worst 569 non-null float64\n",
" 31 fractal_dimension_worst 569 non-null float64\n",
" 32 Unnamed: 32 0 non-null float64\n",
"dtypes: float64(31), int64(1), object(1)\n",
"memory usage: 146.8+ KB\n"
]
}
],
"source": [
"data.info()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>diagnosis</th>\n",
" <th>radius_mean</th>\n",
" <th>texture_mean</th>\n",
" <th>perimeter_mean</th>\n",
" <th>area_mean</th>\n",
" <th>smoothness_mean</th>\n",
" <th>compactness_mean</th>\n",
" <th>concavity_mean</th>\n",
" <th>concave points_mean</th>\n",
" <th>...</th>\n",
" <th>texture_worst</th>\n",
" <th>perimeter_worst</th>\n",
" <th>area_worst</th>\n",
" <th>smoothness_worst</th>\n",
" <th>compactness_worst</th>\n",
" <th>concavity_worst</th>\n",
" <th>concave points_worst</th>\n",
" <th>symmetry_worst</th>\n",
" <th>fractal_dimension_worst</th>\n",
" <th>Unnamed: 32</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>842302</td>\n",
" <td>M</td>\n",
" <td>17.99</td>\n",
" <td>10.38</td>\n",
" <td>122.80</td>\n",
" <td>1001.0</td>\n",
" <td>0.11840</td>\n",
" <td>0.27760</td>\n",
" <td>0.3001</td>\n",
" <td>0.14710</td>\n",
" <td>...</td>\n",
" <td>17.33</td>\n",
" <td>184.60</td>\n",
" <td>2019.0</td>\n",
" <td>0.1622</td>\n",
" <td>0.6656</td>\n",
" <td>0.7119</td>\n",
" <td>0.2654</td>\n",
" <td>0.4601</td>\n",
" <td>0.11890</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>842517</td>\n",
" <td>M</td>\n",
" <td>20.57</td>\n",
" <td>17.77</td>\n",
" <td>132.90</td>\n",
" <td>1326.0</td>\n",
" <td>0.08474</td>\n",
" <td>0.07864</td>\n",
" <td>0.0869</td>\n",
" <td>0.07017</td>\n",
" <td>...</td>\n",
" <td>23.41</td>\n",
" <td>158.80</td>\n",
" <td>1956.0</td>\n",
" <td>0.1238</td>\n",
" <td>0.1866</td>\n",
" <td>0.2416</td>\n",
" <td>0.1860</td>\n",
" <td>0.2750</td>\n",
" <td>0.08902</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>84300903</td>\n",
" <td>M</td>\n",
" <td>19.69</td>\n",
" <td>21.25</td>\n",
" <td>130.00</td>\n",
" <td>1203.0</td>\n",
" <td>0.10960</td>\n",
" <td>0.15990</td>\n",
" <td>0.1974</td>\n",
" <td>0.12790</td>\n",
" <td>...</td>\n",
" <td>25.53</td>\n",
" <td>152.50</td>\n",
" <td>1709.0</td>\n",
" <td>0.1444</td>\n",
" <td>0.4245</td>\n",
" <td>0.4504</td>\n",
" <td>0.2430</td>\n",
" <td>0.3613</td>\n",
" <td>0.08758</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>84348301</td>\n",
" <td>M</td>\n",
" <td>11.42</td>\n",
" <td>20.38</td>\n",
" <td>77.58</td>\n",
" <td>386.1</td>\n",
" <td>0.14250</td>\n",
" <td>0.28390</td>\n",
" <td>0.2414</td>\n",
" <td>0.10520</td>\n",
" <td>...</td>\n",
" <td>26.50</td>\n",
" <td>98.87</td>\n",
" <td>567.7</td>\n",
" <td>0.2098</td>\n",
" <td>0.8663</td>\n",
" <td>0.6869</td>\n",
" <td>0.2575</td>\n",
" <td>0.6638</td>\n",
" <td>0.17300</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>84358402</td>\n",
" <td>M</td>\n",
" <td>20.29</td>\n",
" <td>14.34</td>\n",
" <td>135.10</td>\n",
" <td>1297.0</td>\n",
" <td>0.10030</td>\n",
" <td>0.13280</td>\n",
" <td>0.1980</td>\n",
" <td>0.10430</td>\n",
" <td>...</td>\n",
" <td>16.67</td>\n",
" <td>152.20</td>\n",
" <td>1575.0</td>\n",
" <td>0.1374</td>\n",
" <td>0.2050</td>\n",
" <td>0.4000</td>\n",
" <td>0.1625</td>\n",
" <td>0.2364</td>\n",
" <td>0.07678</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 33 columns</p>\n",
"</div>"
],
"text/plain": [
" id diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n",
"0 842302 M 17.99 10.38 122.80 1001.0 \n",
"1 842517 M 20.57 17.77 132.90 1326.0 \n",
"2 84300903 M 19.69 21.25 130.00 1203.0 \n",
"3 84348301 M 11.42 20.38 77.58 386.1 \n",
"4 84358402 M 20.29 14.34 135.10 1297.0 \n",
"\n",
" smoothness_mean compactness_mean concavity_mean concave points_mean \\\n",
"0 0.11840 0.27760 0.3001 0.14710 \n",
"1 0.08474 0.07864 0.0869 0.07017 \n",
"2 0.10960 0.15990 0.1974 0.12790 \n",
"3 0.14250 0.28390 0.2414 0.10520 \n",
"4 0.10030 0.13280 0.1980 0.10430 \n",
"\n",
" ... texture_worst perimeter_worst area_worst smoothness_worst \\\n",
"0 ... 17.33 184.60 2019.0 0.1622 \n",
"1 ... 23.41 158.80 1956.0 0.1238 \n",
"2 ... 25.53 152.50 1709.0 0.1444 \n",
"3 ... 26.50 98.87 567.7 0.2098 \n",
"4 ... 16.67 152.20 1575.0 0.1374 \n",
"\n",
" compactness_worst concavity_worst concave points_worst symmetry_worst \\\n",
"0 0.6656 0.7119 0.2654 0.4601 \n",
"1 0.1866 0.2416 0.1860 0.2750 \n",
"2 0.4245 0.4504 0.2430 0.3613 \n",
"3 0.8663 0.6869 0.2575 0.6638 \n",
"4 0.2050 0.4000 0.1625 0.2364 \n",
"\n",
" fractal_dimension_worst Unnamed: 32 \n",
"0 0.11890 NaN \n",
"1 0.08902 NaN \n",
"2 0.08758 NaN \n",
"3 0.17300 NaN \n",
"4 0.07678 NaN \n",
"\n",
"[5 rows x 33 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Resolving issues"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# There are issues with this dataset - 'diagnosis' (our y object) MUST be an integer\n",
"# we can simply just create another column and fill that with the correct versions of values (ie 0, 1) & replace the intial column\n",
"fDiagnosis = pd.get_dummies(data[\"diagnosis\"]) # use 'get_dummies' method converts categorical variable into dummy/indicator variables\n",
"# note: this creates a column for each categorical, where 1 represents in each column whether a row had that value set or not\n",
"data[\"diagnosis\"] = fDiagnosis.iloc[:, -1] # replace old w in new column"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Another issue - column 'id' & 'Unnamed 32' are unusable - delete them\n",
"data.drop(columns=\"id\", inplace=True) # remove old column\n",
"data.drop(columns=\"Unnamed: 32\", inplace=True) # remove old column"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>diagnosis</th>\n",
" <th>radius_mean</th>\n",
" <th>texture_mean</th>\n",
" <th>perimeter_mean</th>\n",
" <th>area_mean</th>\n",
" <th>smoothness_mean</th>\n",
" <th>compactness_mean</th>\n",
" <th>concavity_mean</th>\n",
" <th>concave points_mean</th>\n",
" <th>symmetry_mean</th>\n",
" <th>...</th>\n",
" <th>radius_worst</th>\n",
" <th>texture_worst</th>\n",
" <th>perimeter_worst</th>\n",
" <th>area_worst</th>\n",
" <th>smoothness_worst</th>\n",
" <th>compactness_worst</th>\n",
" <th>concavity_worst</th>\n",
" <th>concave points_worst</th>\n",
" <th>symmetry_worst</th>\n",
" <th>fractal_dimension_worst</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>17.99</td>\n",
" <td>10.38</td>\n",
" <td>122.80</td>\n",
" <td>1001.0</td>\n",
" <td>0.11840</td>\n",
" <td>0.27760</td>\n",
" <td>0.3001</td>\n",
" <td>0.14710</td>\n",
" <td>0.2419</td>\n",
" <td>...</td>\n",
" <td>25.38</td>\n",
" <td>17.33</td>\n",
" <td>184.60</td>\n",
" <td>2019.0</td>\n",
" <td>0.1622</td>\n",
" <td>0.6656</td>\n",
" <td>0.7119</td>\n",
" <td>0.2654</td>\n",
" <td>0.4601</td>\n",
" <td>0.11890</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>20.57</td>\n",
" <td>17.77</td>\n",
" <td>132.90</td>\n",
" <td>1326.0</td>\n",
" <td>0.08474</td>\n",
" <td>0.07864</td>\n",
" <td>0.0869</td>\n",
" <td>0.07017</td>\n",
" <td>0.1812</td>\n",
" <td>...</td>\n",
" <td>24.99</td>\n",
" <td>23.41</td>\n",
" <td>158.80</td>\n",
" <td>1956.0</td>\n",
" <td>0.1238</td>\n",
" <td>0.1866</td>\n",
" <td>0.2416</td>\n",
" <td>0.1860</td>\n",
" <td>0.2750</td>\n",
" <td>0.08902</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>19.69</td>\n",
" <td>21.25</td>\n",
" <td>130.00</td>\n",
" <td>1203.0</td>\n",
" <td>0.10960</td>\n",
" <td>0.15990</td>\n",
" <td>0.1974</td>\n",
" <td>0.12790</td>\n",
" <td>0.2069</td>\n",
" <td>...</td>\n",
" <td>23.57</td>\n",
" <td>25.53</td>\n",
" <td>152.50</td>\n",
" <td>1709.0</td>\n",
" <td>0.1444</td>\n",
" <td>0.4245</td>\n",
" <td>0.4504</td>\n",
" <td>0.2430</td>\n",
" <td>0.3613</td>\n",
" <td>0.08758</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>11.42</td>\n",
" <td>20.38</td>\n",
" <td>77.58</td>\n",
" <td>386.1</td>\n",
" <td>0.14250</td>\n",
" <td>0.28390</td>\n",
" <td>0.2414</td>\n",
" <td>0.10520</td>\n",
" <td>0.2597</td>\n",
" <td>...</td>\n",
" <td>14.91</td>\n",
" <td>26.50</td>\n",
" <td>98.87</td>\n",
" <td>567.7</td>\n",
" <td>0.2098</td>\n",
" <td>0.8663</td>\n",
" <td>0.6869</td>\n",
" <td>0.2575</td>\n",
" <td>0.6638</td>\n",
" <td>0.17300</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>20.29</td>\n",
" <td>14.34</td>\n",
" <td>135.10</td>\n",
" <td>1297.0</td>\n",
" <td>0.10030</td>\n",
" <td>0.13280</td>\n",
" <td>0.1980</td>\n",
" <td>0.10430</td>\n",
" <td>0.1809</td>\n",
" <td>...</td>\n",
" <td>22.54</td>\n",
" <td>16.67</td>\n",
" <td>152.20</td>\n",
" <td>1575.0</td>\n",
" <td>0.1374</td>\n",
" <td>0.2050</td>\n",
" <td>0.4000</td>\n",
" <td>0.1625</td>\n",
" <td>0.2364</td>\n",
" <td>0.07678</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 31 columns</p>\n",
"</div>"
],
"text/plain": [
" diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n",
"0 1 17.99 10.38 122.80 1001.0 \n",
"1 1 20.57 17.77 132.90 1326.0 \n",
"2 1 19.69 21.25 130.00 1203.0 \n",
"3 1 11.42 20.38 77.58 386.1 \n",
"4 1 20.29 14.34 135.10 1297.0 \n",
"\n",
" smoothness_mean compactness_mean concavity_mean concave points_mean \\\n",
"0 0.11840 0.27760 0.3001 0.14710 \n",
"1 0.08474 0.07864 0.0869 0.07017 \n",
"2 0.10960 0.15990 0.1974 0.12790 \n",
"3 0.14250 0.28390 0.2414 0.10520 \n",
"4 0.10030 0.13280 0.1980 0.10430 \n",
"\n",
" symmetry_mean ... radius_worst texture_worst perimeter_worst \\\n",
"0 0.2419 ... 25.38 17.33 184.60 \n",
"1 0.1812 ... 24.99 23.41 158.80 \n",
"2 0.2069 ... 23.57 25.53 152.50 \n",
"3 0.2597 ... 14.91 26.50 98.87 \n",
"4 0.1809 ... 22.54 16.67 152.20 \n",
"\n",
" area_worst smoothness_worst compactness_worst concavity_worst \\\n",
"0 2019.0 0.1622 0.6656 0.7119 \n",
"1 1956.0 0.1238 0.1866 0.2416 \n",
"2 1709.0 0.1444 0.4245 0.4504 \n",
"3 567.7 0.2098 0.8663 0.6869 \n",
"4 1575.0 0.1374 0.2050 0.4000 \n",
"\n",
" concave points_worst symmetry_worst fractal_dimension_worst \n",
"0 0.2654 0.4601 0.11890 \n",
"1 0.1860 0.2750 0.08902 \n",
"2 0.2430 0.3613 0.08758 \n",
"3 0.2575 0.6638 0.17300 \n",
"4 0.1625 0.2364 0.07678 \n",
"\n",
"[5 rows x 31 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Now view fixed data\n",
"data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Learning itself\n",
"### Split sets"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"x: texture_mean perimeter_mean area_mean smoothness_mean \\\n",
"0 10.38 122.80 1001.0 0.11840 \n",
"1 17.77 132.90 1326.0 0.08474 \n",
"2 21.25 130.00 1203.0 0.10960 \n",
"3 20.38 77.58 386.1 0.14250 \n",
"4 14.34 135.10 1297.0 0.10030 \n",
".. ... ... ... ... \n",
"564 22.39 142.00 1479.0 0.11100 \n",
"565 28.25 131.20 1261.0 0.09780 \n",
"566 28.08 108.30 858.1 0.08455 \n",
"567 29.33 140.10 1265.0 0.11780 \n",
"568 24.54 47.92 181.0 0.05263 \n",
"\n",
" compactness_mean concavity_mean concave points_mean symmetry_mean \\\n",
"0 0.27760 0.30010 0.14710 0.2419 \n",
"1 0.07864 0.08690 0.07017 0.1812 \n",
"2 0.15990 0.19740 0.12790 0.2069 \n",
"3 0.28390 0.24140 0.10520 0.2597 \n",
"4 0.13280 0.19800 0.10430 0.1809 \n",
".. ... ... ... ... \n",
"564 0.11590 0.24390 0.13890 0.1726 \n",
"565 0.10340 0.14400 0.09791 0.1752 \n",
"566 0.10230 0.09251 0.05302 0.1590 \n",
"567 0.27700 0.35140 0.15200 0.2397 \n",
"568 0.04362 0.00000 0.00000 0.1587 \n",
"\n",
" fractal_dimension_mean radius_se ... radius_worst texture_worst \\\n",
"0 0.07871 1.0950 ... 25.380 17.33 \n",
"1 0.05667 0.5435 ... 24.990 23.41 \n",
"2 0.05999 0.7456 ... 23.570 25.53 \n",
"3 0.09744 0.4956 ... 14.910 26.50 \n",
"4 0.05883 0.7572 ... 22.540 16.67 \n",
".. ... ... ... ... ... \n",
"564 0.05623 1.1760 ... 25.450 26.40 \n",
"565 0.05533 0.7655 ... 23.690 38.25 \n",
"566 0.05648 0.4564 ... 18.980 34.12 \n",
"567 0.07016 0.7260 ... 25.740 39.42 \n",
"568 0.05884 0.3857 ... 9.456 30.37 \n",
"\n",
" perimeter_worst area_worst smoothness_worst compactness_worst \\\n",
"0 184.60 2019.0 0.16220 0.66560 \n",
"1 158.80 1956.0 0.12380 0.18660 \n",
"2 152.50 1709.0 0.14440 0.42450 \n",
"3 98.87 567.7 0.20980 0.86630 \n",
"4 152.20 1575.0 0.13740 0.20500 \n",
".. ... ... ... ... \n",
"564 166.10 2027.0 0.14100 0.21130 \n",
"565 155.00 1731.0 0.11660 0.19220 \n",
"566 126.70 1124.0 0.11390 0.30940 \n",
"567 184.60 1821.0 0.16500 0.86810 \n",
"568 59.16 268.6 0.08996 0.06444 \n",
"\n",
" concavity_worst concave points_worst symmetry_worst \\\n",
"0 0.7119 0.2654 0.4601 \n",
"1 0.2416 0.1860 0.2750 \n",
"2 0.4504 0.2430 0.3613 \n",
"3 0.6869 0.2575 0.6638 \n",
"4 0.4000 0.1625 0.2364 \n",
".. ... ... ... \n",
"564 0.4107 0.2216 0.2060 \n",
"565 0.3215 0.1628 0.2572 \n",
"566 0.3403 0.1418 0.2218 \n",
"567 0.9387 0.2650 0.4087 \n",
"568 0.0000 0.0000 0.2871 \n",
"\n",
" fractal_dimension_worst \n",
"0 0.11890 \n",
"1 0.08902 \n",
"2 0.08758 \n",
"3 0.17300 \n",
"4 0.07678 \n",
".. ... \n",
"564 0.07115 \n",
"565 0.06637 \n",
"566 0.07820 \n",
"567 0.12400 \n",
"568 0.07039 \n",
"\n",
"[569 rows x 29 columns]\n",
"y: 0 1\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 1\n",
" ..\n",
"564 1\n",
"565 1\n",
"566 1\n",
"567 1\n",
"568 0\n",
"Name: diagnosis, Length: 569, dtype: uint8\n"
]
}
],
"source": [
"x = data.iloc[:, 2:].values # values we want to classify - we only want\n",
"print(\"x: \", data.iloc[:, 2:])\n",
"y = data.iloc[:, 0].values # acceptances for each row (ie either benign (0) or malignant (1))\n",
"print(\"y: \", data.iloc[:, 0])\n",
"x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=0) # split dataset into train, test"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train model"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"model = LogisticRegression(max_iter=20000)\n",
"model.fit(x_train, y_train)\n",
"predictions = model.predict(x_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test accuracy using testing values"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Predicted values:</th>\n",
" <th>Actual values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Predicted values: Actual values\n",
"0 1 1\n",
"1 0 0\n",
"2 0 0\n",
"3 0 0\n",
"4 0 0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_test_pred = model.predict(x_test) # based on our model, give it values to try to predict with\n",
"pred_vs_actual = pd.DataFrame({\"Predicted values:\": y_test_pred, \"Actual values\": y_test})\n",
"pred_vs_actual.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean squared error: 0.04895104895104895\n",
"Mean absolute error: 1.8251748251748252\n",
"Model accuracy: 0.951\n"
]
}
],
"source": [
"print(\"Mean squared error:\", mean_squared_error(y_test, y_test_pred))\n",
"print(\"Mean absolute error:\", mean_absolute_error(y_test, y_test_pred))\n",
"accuracy = model.score(x_test, y_test) # or simply called score method to use the models inherent predictions vs a dataset / subset we give it \n",
"print(\"Model accuracy: {:.3f}\".format(accuracy)) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}