From 7f1f3d26e60e4ea92e4c5eb1cbffa505d9cfb811 Mon Sep 17 00:00:00 2001
From: "Yijun Li (liy312)"
Date: Tue, 29 Nov 2022 07:32:52 +0000
Subject: [PATCH] The code of this project.

---
 coursework.ipynb | 914 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 914 insertions(+)
 create mode 100644 coursework.ipynb

diff --git a/coursework.ipynb b/coursework.ipynb
new file mode 100644
index 0000000..42f13b9
--- /dev/null
+++ b/coursework.ipynb
@@ -0,0 +1,914 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Section 1: Input and process data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import the required libraries\n",
+    "import pandas as pd # data analysis library\n",
+    "import numpy as np # numerical computing library\n",
+    "import matplotlib.pyplot as plt # 2D plotting library\n",
+    "import warnings\n",
+    "%matplotlib inline\n",
+    "warnings.filterwarnings(\"ignore\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1.1 data input"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(\"healthcare-dataset-stroke-data.csv\") # read the data from the dataset\n",
+    "df.head() # preview the data frame"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1.2 check the data structure"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.info() # display column types and non-null counts\n",
+    "df.shape # display the shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1.3 drop the valueless column and duplicate rows"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.drop(\"id\", axis=1, inplace=True) # the id column carries no predictive value\n",
+    "df = df.drop_duplicates() # remove duplicate rows\n",
+    "df.shape # check the frame shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.describe() # check the descriptive statistics"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1.4 check the unique and null data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.nunique() # count the unique values in each column\n",
+    "df.isnull().any() # find columns with null values"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1.5 handle missing values and anomalous categories"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fill the missing bmi values with the column mean\n",
+    "df[\"bmi\"].fillna(df[\"bmi\"].mean(), inplace=True)\n",
+    "df.isnull().any() # check the data after processing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# list the unique values of every categorical column\n",
+    "columns = [\"gender\", \"hypertension\", \"heart_disease\", \"ever_married\", \"work_type\", \"Residence_type\", \"smoking_status\", \"stroke\"]\n",
+    "for colm in columns:\n",
+    "    print(\"The unique values of\", colm, \"are:\", df[colm].unique())"
+   ]
+  },
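+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Side note: bmi is usually right-skewed, so the median is a common, more robust alternative to the mean imputation used above. The next cell is only a quick sketch comparing the two candidates on a fresh copy of the CSV (the raw frame is introduced just for this check); it is not part of the main pipeline."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# side check only: compare the two imputation candidates for bmi\n",
+    "raw = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")\n",
+    "print(\"bmi mean:  \", raw[\"bmi\"].mean())\n",
+    "print(\"bmi median:\", raw[\"bmi\"].median())\n",
+    "# median imputation would be: raw[\"bmi\"].fillna(raw[\"bmi\"].median())"
+   ]
+  },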
"df[\"gender\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# since just one special type, just remove it\n", + "df = df.drop(df[df[\"gender\"] == \"Other\"].index)\n", + "df[\"gender\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# check the how mach Unknow in smoking_status\n", + "df[\"smoking_status\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# use mode to replace unknow value\n", + "from numpy import nan\n", + "df[\"smoking_status\"].replace(\"Unknown\", nan, inplace=True) # first, replace Unknow to nan\n", + "df[\"smoking_status\"].fillna(df[\"smoking_status\"].mode()[0], inplace=True) # and then replace to the mode\n", + "df[\"smoking_status\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Section 2: Data analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2.1 Analysis of discrete variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# check for the stroke distribution\n", + "plt.figure(figsize=(8,6)) # define the figure size\n", + "plt.title(\"The rate of stroke\") # define the title\n", + "labels = df[\"stroke\"].value_counts().index\n", + "plt.pie(df[\"stroke\"].value_counts(), labels=labels,autopct='%1.2f%%',shadow=True,explode=(0,1))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# explore the relationship between stroke and rest of indicators\n", + "import seaborn as sns\n", + "# list of data to analysis\n", + "columns = [\"gender\",\"heart_disease\", \"ever_married\", \"work_type\", \"Residence_type\", \"smoking_status\"]\n", + "# define plot size\n", + "plt.figure(figsize=(2*15,3*12))\n", + "plt.rcParams[\"font.size\"] = 20\n", + "for col in columns:\n", + " idx = columns.index(col) + 1\n", + " xn = plt.subplot(3,2,idx)\n", + " sns.countplot(df[\"stroke\"], hue = df[col])\n", + " plt.ylabel(\"headcount\")\n", + " xn.set_title(\"Correlation between \"f\"{col}\"\" and stroke\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2.2 Analysis of continuous variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "figure = plt.figure(figsize=(8,6))\n", + "plt.title(\"age and bmi\")\n", + "sns.scatterplot(x=df[\"age\"], y=df[\"bmi\"],hue=df[\"stroke\"],style=df[\"stroke\"])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "figure = plt.figure(figsize=(8,6))\n", + "plt.title(\"age and avg_glucose_level\")\n", + "sns.scatterplot(x=df[\"age\"], y=df[\"avg_glucose_level\"],hue=df[\"stroke\"],style=df[\"stroke\"])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "figure = plt.figure(figsize=(8,6))\n", + "plt.title(\"bmi and avg_glucose_level\")\n", + "sns.scatterplot(x=df[\"bmi\"], y=df[\"avg_glucose_level\"],hue=df[\"stroke\"],style=df[\"stroke\"])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# display the distribution of avg_glucose_level\n", + 
"plt.figure(figsize=(10,6))\n", + "plt.title(\"The distribution of avg_glucose_level\")\n", + "sns.kdeplot(df[\"avg_glucose_level\"])\n", + "# save the stroke == 1 data from avg_glucose_level\n", + "stroker = df.query(\"stroke==1\")[\"avg_glucose_level\"]\n", + "sns.kdeplot(stroker)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# display the distribution of bmi\n", + "plt.figure(figsize=(10,6))\n", + "plt.title(\"The distribution of bmi\")\n", + "sns.kdeplot(df[\"bmi\"])\n", + "# save the stroke == 1 data from bmi\n", + "stroker = df.query(\"stroke==1\")[\"bmi\"]\n", + "sns.kdeplot(stroker)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2.3 Eigenvalue digitization and analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# use get_dummies and label encoder to encode data\n", + "from pandas import get_dummies\n", + "from sklearn.preprocessing import LabelEncoder\n", + "# OneHotEncode the data has more than 2 categories\n", + "df1 = get_dummies(df, columns=[\"work_type\", \"Residence_type\", \"smoking_status\"])\n", + "encoder = LabelEncoder()\n", + "# LabelEncode the data binary classification data\n", + "df1[\"gender\"] = encoder.fit_transform(df1[\"gender\"])\n", + "df1[\"ever_married\"] = encoder.fit_transform(df1[\"ever_married\"])\n", + "df1.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Build a correlation matrix to see the correlation between columns. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# plot a hotmap to explore the correlation between each data\n", + "clomns = [\"age\",\"hypertension\",\"heart_disease\",\"avg_glucose_level\",\"bmi\",\"gender\",\"ever_married\",\"work_type_Never_worked\",\"work_type_Private\",\n", + "\"work_type_Self-employed\",\"work_type_children\",\"Residence_type_Rural\",\"smoking_status_formerly smoked\",\"smoking_status_never smoked\",\"smoking_status_smokes\"]\n", + "data = df1[clomns]\n", + "correlation_matrix = data.corr().round(2)\n", + "plt.figure(figsize=(25,15),dpi=200)\n", + "sns.heatmap(correlation_matrix, annot=True)\n", + "plt.title(\"correlation between classes\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot a bar chart to show the correlation between stroke and variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(15,5))\n", + "plt.title(\"The correlation between stroke and variables\")\n", + "# use sort_value to distribute the data from high to low\n", + "df1.corr()[\"stroke\"].sort_values(ascending = False).plot(kind = \"bar\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# explore the stroker distrubute in different ages\n", + "plt.figure(figsize=(10,6))\n", + "plt.title(\"Changes in stroke probability with age\")\n", + "# plot the density distribution of people who have not stroke with ages. \n", + "sns.kdeplot(df1.age[df1.stroke==0],color=\"b\",shade=True,label=\"stroke=0\")\n", + "# plot the density distribution of people who have stroke with ages. 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "2.4 Feature selection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# drop the low-correlation columns\n",
+    "df1.drop([\"gender\", \"Residence_type_Rural\", \"Residence_type_Urban\"], axis=1, inplace=True)\n",
+    "df1.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# standardize the continuous variables so they share a scale for the boxplot\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "ss = StandardScaler()\n",
+    "data_check = df1[[\"age\", \"bmi\", \"avg_glucose_level\"]].copy() # copy so df1 itself is not modified\n",
+    "data_check[[\"age\", \"bmi\", \"avg_glucose_level\"]] = ss.fit_transform(data_check[[\"age\", \"bmi\", \"avg_glucose_level\"]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use a box plot to check whether the data contains outliers\n",
+    "plt.figure(figsize=(10,5), dpi=120) # a higher dpi makes the plot easier to read\n",
+    "plt.title(\"The value distribution of age, avg_glucose_level and bmi\")\n",
+    "boxes = sns.boxplot(data=data_check[[\"age\", \"avg_glucose_level\", \"bmi\"]])\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Use the quartile (IQR) method to find outliers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from collections import Counter\n",
+    "\n",
+    "data_to_test = df1[[\"avg_glucose_level\", \"bmi\"]]\n",
+    "\n",
+    "def delete_outliers(data, cloms, n): # IQR-based outlier detection\n",
+    "    indices = [] # collect the outlier indices\n",
+    "    for clom in cloms:\n",
+    "        q1 = np.percentile(data[clom], 25) # first quartile\n",
+    "        q3 = np.percentile(data[clom], 75) # third quartile\n",
+    "        IQR = q3 - q1 # the interquartile range between q1 and q3\n",
+    "        outlier_step = 1.5 * IQR\n",
+    "        # values below q1 - 1.5*IQR or above q3 + 1.5*IQR are treated as outliers\n",
+    "        outlier_colm = data[(data[clom] < q1 - outlier_step) | (data[clom] > q3 + outlier_step)].index\n",
+    "        indices.extend(outlier_colm)\n",
+    "    indices = Counter(indices) # count in how many columns each row is flagged\n",
+    "    # return the rows flagged as outliers in at least n columns\n",
+    "    outliers_ = list(k for k, v in indices.items() if v >= n)\n",
+    "    return outliers_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "outliers = delete_outliers(data_to_test, [\"avg_glucose_level\", \"bmi\"], 2)\n",
+    "len(outliers) # the number of detected outliers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = df1.drop(outliers, axis=0).reset_index(drop=True) # remove the outliers from the dataset\n",
+    "df1.shape # check the shape after the drop"
+   ]
+  },
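+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick sanity check, delete_outliers can be exercised on a tiny toy frame; the values below are made up purely for illustration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sanity check on a fabricated toy frame\n",
+    "toy = pd.DataFrame({\"a\": [1, 2, 3, 2, 100], \"b\": [5, 6, 5, 7, 900]})\n",
+    "# row 4 is extreme in both columns, so with n=2 it should be the only result\n",
+    "print(delete_outliers(toy, [\"a\", \"b\"], 2)) # expect [4]"
+   ]
+  },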
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Section 3: establish analysis models"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.1 Build the train and test data sets"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.1.1 scale the unevenly distributed data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# standardize the column that is not evenly distributed\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "ss = StandardScaler()\n",
+    "df1[[\"avg_glucose_level\"]] = ss.fit_transform(df1[[\"avg_glucose_level\"]])\n",
+    "df1[\"avg_glucose_level\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.1.2 split the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_data = df1.drop([\"stroke\"], axis=1)\n",
+    "# feature data\n",
+    "X = train_data\n",
+    "# target data, selected by name rather than by position\n",
+    "y = df1[\"stroke\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# split the data to build the models\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "# use 75% of the data for training and 25% for testing\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=50)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.1.3 use SMOTE to oversample the training data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from imblearn.over_sampling import SMOTE\n",
+    "smo = SMOTE(random_state=32)\n",
+    "X_train, y_train = smo.fit_resample(X_train, y_train)\n",
+    "# check the class balance after SMOTE\n",
+    "print(\"After SMOTE the stroke number is:\", sum(y_train == 1))\n",
+    "print(\"After SMOTE the non-stroke number is:\", sum(y_train == 0))\n",
+    "X_train.shape, y_train.shape"
+   ]
+  },
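+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "One caveat: the evaluator defined below cross-validates on the already-oversampled training set, which lets synthetic points leak between folds and can inflate the CV score. A commonly suggested alternative, sketched here but not used in the rest of this notebook, is imblearn's Pipeline, which re-applies SMOTE inside each fold:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sketch only: oversample inside each CV fold instead of once up front\n",
+    "from imblearn.pipeline import Pipeline as ImbPipeline\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import cross_val_score\n",
+    "pipe = ImbPipeline(steps=[(\"smote\", SMOTE(random_state=32)),\n",
+    "                          (\"clf\", LogisticRegression(max_iter=1000))])\n",
+    "# cross_val_score(pipe, X, y, cv=4)  # SMOTE is then fitted only on each fold's training part"
+   ]
+  },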
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.1.4 Create a function to execute models and evaluate them"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Define a function that fits each model, computes the evaluation scores, and records them in a list."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import the evaluation metrics\n",
+    "from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score\n",
+    "from sklearn.model_selection import cross_val_score\n",
+    "results = [] # where the evaluation results are stored\n",
+    "def evaluator(model):\n",
+    "    # fit the model on the training data\n",
+    "    model.fit(X_train, y_train)\n",
+    "    # predict on the test set\n",
+    "    y_pred = model.predict(X_test)\n",
+    "    # a series of evaluation metrics\n",
+    "    cm = confusion_matrix(y_test, y_pred)\n",
+    "    # accuracy score\n",
+    "    accuracy = accuracy_score(y_test, y_pred)\n",
+    "    # cross validation\n",
+    "    cvs = cross_val_score(model, X_train, y_train, cv=4)\n",
+    "    # roc auc score (computed from the hard predictions here)\n",
+    "    roc_auc = roc_auc_score(y_test, y_pred)\n",
+    "    # recall score\n",
+    "    recall = recall_score(y_test, y_pred)\n",
+    "    # precision score\n",
+    "    precision = precision_score(y_test, y_pred)\n",
+    "    f1 = f1_score(y_test, y_pred)\n",
+    "    # print the results\n",
+    "    print(model, \":\")\n",
+    "    print(cm)\n",
+    "    print(\"accuracy score:\", accuracy)\n",
+    "    print(\"cvs mean score:\", cvs.mean())\n",
+    "    print(\"roc auc score:\", roc_auc)\n",
+    "    print(\"recall:\", recall)\n",
+    "    print(\"precision:\", precision)\n",
+    "    print(\"f1 score:\", f1)\n",
+    "    # append to a list for further comparison\n",
+    "    results.append([accuracy, cvs.mean(), roc_auc, recall, precision, f1])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.2 Analysis of the models"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.2.1 Logistic Regression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.linear_model import LogisticRegression\n",
+    "lr = LogisticRegression()\n",
+    "evaluator(lr)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import classification_report\n",
+    "y_pred = lr.predict(X_test)\n",
+    "y_prob = lr.predict_proba(X_test)[:,1]\n",
+    "print(classification_report(y_test, y_pred))\n",
+    "print('accuracy: ', accuracy_score(y_test, y_pred))\n",
+    "print('roc auc score: ', roc_auc_score(y_test, y_prob))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.2.2 Random Forest"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "rf = RandomForestClassifier()\n",
+    "evaluator(rf)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.2.3 Support Vector Machine"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.svm import SVC\n",
+    "svm = SVC(kernel='rbf')\n",
+    "evaluator(svm)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.2.4 XGBoost"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from xgboost import XGBClassifier\n",
+    "xgb = XGBClassifier()\n",
+    "evaluator(xgb)"
+   ]
+  },
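+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before comparing the scores, it can help to see which features the fitted random forest relied on. A minimal sketch using the rf model fitted above (these are impurity-based importances, which are known to favour high-cardinality features):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# side sketch: impurity-based feature importances of the fitted random forest\n",
+    "importances = pd.Series(rf.feature_importances_, index=X.columns)\n",
+    "importances.sort_values(ascending=False).plot(kind=\"bar\", figsize=(12,4))\n",
+    "plt.ylabel(\"importance\")\n",
+    "plt.show()"
+   ]
+  },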
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Section 4: Comparison of scores"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# round all results to four decimal places\n",
+    "results = np.round(results, 4)\n",
+    "results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# list of models\n",
+    "models = [\"LogisticRegression\", \"RandomForest\", \"Support Vector Machine\", \"XGBoost\"]\n",
+    "# list of evaluation metrics\n",
+    "cloms = [\"model\", \"accuracy\", \"cvs mean\", \"roc_auc\", \"recall\", \"precision\", \"f1\"]\n",
+    "results_compare = pd.DataFrame(columns=cloms)\n",
+    "results_compare[\"model\"] = models\n",
+    "# copy each metric column from the results array\n",
+    "for i, metric in enumerate(cloms[1:]):\n",
+    "    results_compare[metric] = results[:, i]\n",
+    "# sort the models by f1 and accuracy\n",
+    "results_compare.sort_values(by=[\"f1\", \"accuracy\"], inplace=True, ascending=False)\n",
+    "results_compare"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Section 5: model tuning"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "5.1 tune the logistic regression model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import GridSearchCV\n",
+    "# C must be positive, and the default lbfgs solver supports only the l2 (or no) penalty\n",
+    "lr_grid = GridSearchCV(lr, [{\"penalty\": [\"none\", \"l2\"], \"C\": [0.1, 1, 5, 10]}], scoring=\"accuracy\", cv=10)\n",
+    "lr_grid.fit(X_train, y_train)\n",
+    "best_accuracy = lr_grid.best_score_\n",
+    "best_parameters = lr_grid.best_params_\n",
+    "print(\"model\", lr, \"\\n best accuracy is:\", best_accuracy)\n",
+    "print(\"best parameters are:\", best_parameters)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import classification_report\n",
+    "y_pred = lr_grid.predict(X_test)\n",
+    "print(classification_report(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# plot the tuned logistic regression confusion matrix\n",
+    "cfm = confusion_matrix(y_test, y_pred)\n",
+    "sns.heatmap(cfm, annot=True, fmt='d', cmap='Blues')\n",
+    "plt.show()"
+   ]
+  },
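+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "GridSearchCV refits the best configuration on the full training set and exposes it as best_estimator_. A short sketch scoring it on the held-out test set with probability-based ROC AUC, complementing the label-based score used in evaluator (best_lr and y_prob are scratch names introduced only here):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# side sketch: probability-based ROC AUC of the tuned logistic regression\n",
+    "best_lr = lr_grid.best_estimator_\n",
+    "y_prob = best_lr.predict_proba(X_test)[:, 1]\n",
+    "print(\"test roc auc (probabilities):\", roc_auc_score(y_test, y_prob))"
+   ]
+  },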
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "5.2 tune the random forest model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rf_grid = GridSearchCV(rf, [{\"n_estimators\": [100, 150, 200], \"criterion\": [\"gini\", \"entropy\"], \"random_state\": [0]}], scoring=\"accuracy\", cv=10)\n",
+    "rf_grid.fit(X_train, y_train)\n",
+    "best_accuracy = rf_grid.best_score_\n",
+    "best_parameters = rf_grid.best_params_\n",
+    "print(\"model\", rf, \"\\n best accuracy is:\", best_accuracy)\n",
+    "print(\"best parameters are:\", best_parameters)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y_pred = rf_grid.predict(X_test)\n",
+    "print(classification_report(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# plot the tuned random forest confusion matrix\n",
+    "cfm = confusion_matrix(y_test, y_pred)\n",
+    "sns.heatmap(cfm, annot=True, fmt='d', cmap='Blues')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "5.3 tune the support vector machine model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# C must be positive\n",
+    "svm_grid = GridSearchCV(svm, [{'kernel': ['sigmoid', 'poly'], 'C': [0.1, 5, 10], 'gamma': [0.001, 0.0001]}], scoring=\"accuracy\", cv=10)\n",
+    "svm_grid.fit(X_train, y_train)\n",
+    "best_accuracy = svm_grid.best_score_\n",
+    "best_parameters = svm_grid.best_params_\n",
+    "print(\"model\", svm, \"\\n best accuracy is:\", best_accuracy)\n",
+    "print(\"best parameters are:\", best_parameters)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y_pred = svm_grid.predict(X_test)\n",
+    "print(classification_report(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# plot the tuned SVM confusion matrix\n",
+    "cfm = confusion_matrix(y_test, y_pred)\n",
+    "sns.heatmap(cfm, annot=True, fmt='d', cmap='Blues')\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.6 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "865d8b2eb28e274047ba64063dfb6a2aabf0dfec4905d304d7a76618dae6fdd4"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}