From 7f1f3d26e60e4ea92e4c5eb1cbffa505d9cfb811 Mon Sep 17 00:00:00 2001
From: "Yijun Li (liy312)"
Date: Tue, 29 Nov 2022 07:32:52 +0000
Subject: [PATCH] The code of this project.

---
 coursework.ipynb | 914 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 914 insertions(+)
 create mode 100644 coursework.ipynb

diff --git a/coursework.ipynb b/coursework.ipynb
new file mode 100644
index 0000000..42f13b9
--- /dev/null
+++ b/coursework.ipynb
@@ -0,0 +1,914 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Section 1: Input and process data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import the required libraries\n",
+    "import pandas as pd # data analysis library\n",
+    "import numpy as np # numerical computing library\n",
+    "import matplotlib.pyplot as plt # 2D plotting library\n",
+    "import warnings\n",
+    "%matplotlib inline\n",
+    "warnings.filterwarnings(\"ignore\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1.1 data input"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(\"healthcare-dataset-stroke-data.csv\") # read the data from the dataset\n",
+    "df.head() # preview the data frame"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1.2 check the data structure"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.info() # display column types and non-null counts\n",
+    "df.shape # display the shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1.3 drop the valueless column and duplicate rows"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.drop(\"id\", axis=1, inplace=True) # the id column carries no predictive value\n",
+    "df = df.drop_duplicates() # remove duplicate rows\n",
+    "df.shape # check the frame shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.describe() # check the descriptive statistics"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1.4 check the unique and null data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.nunique() # count the unique values in each column\n",
+    "df.isnull().any() # find columns with null values"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1.5 handle missing values and anomalous categories"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fill the missing bmi values with the column mean\n",
+    "df[\"bmi\"].fillna(df[\"bmi\"].mean(), inplace=True)\n",
+    "df.isnull().any() # check the data after processing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# list the unique values of every categorical column\n",
+    "columns = [\"gender\", \"hypertension\", \"heart_disease\", \"ever_married\", \"work_type\", \"Residence_type\", \"smoking_status\", \"stroke\"]\n",
+    "for colm in columns:\n",
+    "    print(\"The unique values of\", colm, \"are:\", df[colm].unique())"
+   ]
+  },
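+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Side note: bmi is usually right-skewed, so the median is a common, more robust alternative to the mean imputation used above. The next cell is only a quick sketch comparing the two candidates on a fresh copy of the CSV (the raw frame is introduced just for this check); it is not part of the main pipeline."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# side check only: compare the two imputation candidates for bmi\n",
+    "raw = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")\n",
+    "print(\"bmi mean:  \", raw[\"bmi\"].mean())\n",
+    "print(\"bmi median:\", raw[\"bmi\"].median())\n",
+    "# median imputation would be: raw[\"bmi\"].fillna(raw[\"bmi\"].median())"
+   ]
+  },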
"df[\"gender\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# since just one special type, just remove it\n", + "df = df.drop(df[df[\"gender\"] == \"Other\"].index)\n", + "df[\"gender\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# check the how mach Unknow in smoking_status\n", + "df[\"smoking_status\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# use mode to replace unknow value\n", + "from numpy import nan\n", + "df[\"smoking_status\"].replace(\"Unknown\", nan, inplace=True) # first, replace Unknow to nan\n", + "df[\"smoking_status\"].fillna(df[\"smoking_status\"].mode()[0], inplace=True) # and then replace to the mode\n", + "df[\"smoking_status\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Section 2: Data analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2.1 Analysis of discrete variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# check for the stroke distribution\n", + "plt.figure(figsize=(8,6)) # define the figure size\n", + "plt.title(\"The rate of stroke\") # define the title\n", + "labels = df[\"stroke\"].value_counts().index\n", + "plt.pie(df[\"stroke\"].value_counts(), labels=labels,autopct='%1.2f%%',shadow=True,explode=(0,1))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# explore the relationship between stroke and rest of indicators\n", + "import seaborn as sns\n", + "# list of data to analysis\n", + "columns = [\"gender\",\"heart_disease\", \"ever_married\", \"work_type\", \"Residence_type\", \"smoking_status\"]\n", + "# define plot size\n", + "plt.figure(figsize=(2*15,3*12))\n", + "plt.rcParams[\"font.size\"] = 20\n", + "for col in columns:\n", + " idx = columns.index(col) + 1\n", + " xn = plt.subplot(3,2,idx)\n", + " sns.countplot(df[\"stroke\"], hue = df[col])\n", + " plt.ylabel(\"headcount\")\n", + " xn.set_title(\"Correlation between \"f\"{col}\"\" and stroke\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2.2 Analysis of continuous variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "figure = plt.figure(figsize=(8,6))\n", + "plt.title(\"age and bmi\")\n", + "sns.scatterplot(x=df[\"age\"], y=df[\"bmi\"],hue=df[\"stroke\"],style=df[\"stroke\"])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "figure = plt.figure(figsize=(8,6))\n", + "plt.title(\"age and avg_glucose_level\")\n", + "sns.scatterplot(x=df[\"age\"], y=df[\"avg_glucose_level\"],hue=df[\"stroke\"],style=df[\"stroke\"])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "figure = plt.figure(figsize=(8,6))\n", + "plt.title(\"bmi and avg_glucose_level\")\n", + "sns.scatterplot(x=df[\"bmi\"], y=df[\"avg_glucose_level\"],hue=df[\"stroke\"],style=df[\"stroke\"])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# display the distribution of avg_glucose_level\n", + 
"plt.figure(figsize=(10,6))\n", + "plt.title(\"The distribution of avg_glucose_level\")\n", + "sns.kdeplot(df[\"avg_glucose_level\"])\n", + "# save the stroke == 1 data from avg_glucose_level\n", + "stroker = df.query(\"stroke==1\")[\"avg_glucose_level\"]\n", + "sns.kdeplot(stroker)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# display the distribution of bmi\n", + "plt.figure(figsize=(10,6))\n", + "plt.title(\"The distribution of bmi\")\n", + "sns.kdeplot(df[\"bmi\"])\n", + "# save the stroke == 1 data from bmi\n", + "stroker = df.query(\"stroke==1\")[\"bmi\"]\n", + "sns.kdeplot(stroker)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2.3 Eigenvalue digitization and analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# use get_dummies and label encoder to encode data\n", + "from pandas import get_dummies\n", + "from sklearn.preprocessing import LabelEncoder\n", + "# OneHotEncode the data has more than 2 categories\n", + "df1 = get_dummies(df, columns=[\"work_type\", \"Residence_type\", \"smoking_status\"])\n", + "encoder = LabelEncoder()\n", + "# LabelEncode the data binary classification data\n", + "df1[\"gender\"] = encoder.fit_transform(df1[\"gender\"])\n", + "df1[\"ever_married\"] = encoder.fit_transform(df1[\"ever_married\"])\n", + "df1.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Build a correlation matrix to see the correlation between columns. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# plot a hotmap to explore the correlation between each data\n", + "clomns = [\"age\",\"hypertension\",\"heart_disease\",\"avg_glucose_level\",\"bmi\",\"gender\",\"ever_married\",\"work_type_Never_worked\",\"work_type_Private\",\n", + "\"work_type_Self-employed\",\"work_type_children\",\"Residence_type_Rural\",\"smoking_status_formerly smoked\",\"smoking_status_never smoked\",\"smoking_status_smokes\"]\n", + "data = df1[clomns]\n", + "correlation_matrix = data.corr().round(2)\n", + "plt.figure(figsize=(25,15),dpi=200)\n", + "sns.heatmap(correlation_matrix, annot=True)\n", + "plt.title(\"correlation between classes\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot a bar chart to show the correlation between stroke and variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(15,5))\n", + "plt.title(\"The correlation between stroke and variables\")\n", + "# use sort_value to distribute the data from high to low\n", + "df1.corr()[\"stroke\"].sort_values(ascending = False).plot(kind = \"bar\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# explore the stroker distrubute in different ages\n", + "plt.figure(figsize=(10,6))\n", + "plt.title(\"Changes in stroke probability with age\")\n", + "# plot the density distribution of people who have not stroke with ages. \n", + "sns.kdeplot(df1.age[df1.stroke==0],color=\"b\",shade=True,label=\"stroke=0\")\n", + "# plot the density distribution of people who have stroke with ages. 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "2.4 Feature selection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# drop the low-correlation columns\n",
+    "df1.drop([\"gender\", \"Residence_type_Rural\", \"Residence_type_Urban\"], axis=1, inplace=True)\n",
+    "df1.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# standardize the continuous variables so they share a scale for the boxplot\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "ss = StandardScaler()\n",
+    "data_check = df1[[\"age\", \"bmi\", \"avg_glucose_level\"]].copy() # copy so df1 itself is not modified\n",
+    "data_check[[\"age\", \"bmi\", \"avg_glucose_level\"]] = ss.fit_transform(data_check[[\"age\", \"bmi\", \"avg_glucose_level\"]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use a box plot to check whether the data contains outliers\n",
+    "plt.figure(figsize=(10,5), dpi=120) # a higher dpi makes the plot easier to read\n",
+    "plt.title(\"The value distribution of age, avg_glucose_level and bmi\")\n",
+    "boxes = sns.boxplot(data=data_check[[\"age\", \"avg_glucose_level\", \"bmi\"]])\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Use the quartile (IQR) method to find outliers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from collections import Counter\n",
+    "\n",
+    "data_to_test = df1[[\"avg_glucose_level\", \"bmi\"]]\n",
+    "\n",
+    "def delete_outliers(data, cloms, n): # IQR-based outlier detection\n",
+    "    indices = [] # collect the outlier indices\n",
+    "    for clom in cloms:\n",
+    "        q1 = np.percentile(data[clom], 25) # first quartile\n",
+    "        q3 = np.percentile(data[clom], 75) # third quartile\n",
+    "        IQR = q3 - q1 # the interquartile range between q1 and q3\n",
+    "        outlier_step = 1.5 * IQR\n",
+    "        # values below q1 - 1.5*IQR or above q3 + 1.5*IQR are treated as outliers\n",
+    "        outlier_colm = data[(data[clom] < q1 - outlier_step) | (data[clom] > q3 + outlier_step)].index\n",
+    "        indices.extend(outlier_colm)\n",
+    "    indices = Counter(indices) # count in how many columns each row is flagged\n",
+    "    # return the rows flagged as outliers in at least n columns\n",
+    "    outliers_ = list(k for k, v in indices.items() if v >= n)\n",
+    "    return outliers_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "outliers = delete_outliers(data_to_test, [\"avg_glucose_level\", \"bmi\"], 2)\n",
+    "len(outliers) # the number of detected outliers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = df1.drop(outliers, axis=0).reset_index(drop=True) # remove the outliers from the dataset\n",
+    "df1.shape # check the shape after the drop"
+   ]
+  },
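+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick sanity check, delete_outliers can be exercised on a tiny toy frame; the values below are made up purely for illustration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sanity check on a fabricated toy frame\n",
+    "toy = pd.DataFrame({\"a\": [1, 2, 3, 2, 100], \"b\": [5, 6, 5, 7, 900]})\n",
+    "# row 4 is extreme in both columns, so with n=2 it should be the only result\n",
+    "print(delete_outliers(toy, [\"a\", \"b\"], 2)) # expect [4]"
+   ]
+  },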
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Section 3: establish analysis models"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.1 Build the train and test data sets"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.1.1 scale the unevenly distributed data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# standardize the column that is not evenly distributed\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "ss = StandardScaler()\n",
+    "df1[[\"avg_glucose_level\"]] = ss.fit_transform(df1[[\"avg_glucose_level\"]])\n",
+    "df1[\"avg_glucose_level\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.1.2 split the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_data = df1.drop([\"stroke\"], axis=1)\n",
+    "# feature data\n",
+    "X = train_data\n",
+    "# target data, selected by name rather than by position\n",
+    "y = df1[\"stroke\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# split the data to build the models\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "# use 75% of the data for training and 25% for testing\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=50)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.1.3 use SMOTE to oversample the training data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from imblearn.over_sampling import SMOTE\n",
+    "smo = SMOTE(random_state=32)\n",
+    "X_train, y_train = smo.fit_resample(X_train, y_train)\n",
+    "# check the class balance after SMOTE\n",
+    "print(\"After SMOTE the stroke number is:\", sum(y_train == 1))\n",
+    "print(\"After SMOTE the non-stroke number is:\", sum(y_train == 0))\n",
+    "X_train.shape, y_train.shape"
+   ]
+  },
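+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "One caveat: the evaluator defined below cross-validates on the already-oversampled training set, which lets synthetic points leak between folds and can inflate the CV score. A commonly suggested alternative, sketched here but not used in the rest of this notebook, is imblearn's Pipeline, which re-applies SMOTE inside each fold:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sketch only: oversample inside each CV fold instead of once up front\n",
+    "from imblearn.pipeline import Pipeline as ImbPipeline\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import cross_val_score\n",
+    "pipe = ImbPipeline(steps=[(\"smote\", SMOTE(random_state=32)),\n",
+    "                          (\"clf\", LogisticRegression(max_iter=1000))])\n",
+    "# cross_val_score(pipe, X, y, cv=4)  # SMOTE is then fitted only on each fold's training part"
+   ]
+  },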
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.1.4 Create a function to execute models and evaluate them"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Define a function that fits each model, computes the evaluation scores, and records them in a list."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import the evaluation metrics\n",
+    "from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score\n",
+    "from sklearn.model_selection import cross_val_score\n",
+    "results = [] # where the evaluation results are stored\n",
+    "def evaluator(model):\n",
+    "    # fit the model on the training data\n",
+    "    model.fit(X_train, y_train)\n",
+    "    # predict on the test set\n",
+    "    y_pred = model.predict(X_test)\n",
+    "    # a series of evaluation metrics\n",
+    "    cm = confusion_matrix(y_test, y_pred)\n",
+    "    # accuracy score\n",
+    "    accuracy = accuracy_score(y_test, y_pred)\n",
+    "    # cross validation\n",
+    "    cvs = cross_val_score(model, X_train, y_train, cv=4)\n",
+    "    # roc auc score (computed from the hard predictions here)\n",
+    "    roc_auc = roc_auc_score(y_test, y_pred)\n",
+    "    # recall score\n",
+    "    recall = recall_score(y_test, y_pred)\n",
+    "    # precision score\n",
+    "    precision = precision_score(y_test, y_pred)\n",
+    "    f1 = f1_score(y_test, y_pred)\n",
+    "    # print the results\n",
+    "    print(model, \":\")\n",
+    "    print(cm)\n",
+    "    print(\"accuracy score:\", accuracy)\n",
+    "    print(\"cvs mean score:\", cvs.mean())\n",
+    "    print(\"roc auc score:\", roc_auc)\n",
+    "    print(\"recall:\", recall)\n",
+    "    print(\"precision:\", precision)\n",
+    "    print(\"f1 score:\", f1)\n",
+    "    # append to a list for further comparison\n",
+    "    results.append([accuracy, cvs.mean(), roc_auc, recall, precision, f1])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.2 Analysis of the models"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.2.1 Logistic Regression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.linear_model import LogisticRegression\n",
+    "lr = LogisticRegression()\n",
+    "evaluator(lr)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import classification_report\n",
+    "y_pred = lr.predict(X_test)\n",
+    "y_prob = lr.predict_proba(X_test)[:,1]\n",
+    "print(classification_report(y_test, y_pred))\n",
+    "print('accuracy: ', accuracy_score(y_test, y_pred))\n",
+    "print('roc auc score: ', roc_auc_score(y_test, y_prob))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.2.2 Random Forest"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "rf = RandomForestClassifier()\n",
+    "evaluator(rf)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.2.3 Support Vector Machine"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.svm import SVC\n",
+    "svm = SVC(kernel='rbf')\n",
+    "evaluator(svm)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3.2.4 XGBoost"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from xgboost import XGBClassifier\n",
+    "xgb = XGBClassifier()\n",
+    "evaluator(xgb)"
+   ]
+  },
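+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before comparing the scores, it can help to see which features the fitted random forest relied on. A minimal sketch using the rf model fitted above (these are impurity-based importances, which are known to favour high-cardinality features):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# side sketch: impurity-based feature importances of the fitted random forest\n",
+    "importances = pd.Series(rf.feature_importances_, index=X.columns)\n",
+    "importances.sort_values(ascending=False).plot(kind=\"bar\", figsize=(12,4))\n",
+    "plt.ylabel(\"importance\")\n",
+    "plt.show()"
+   ]
+  },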
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Section 4: Comparison of scores"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# round all results to four decimal places\n",
+    "results = np.round(results, 4)\n",
+    "results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# list of models\n",
+    "models = [\"LogisticRegression\", \"RandomForest\", \"Support Vector Machine\", \"XGBoost\"]\n",
+    "# list of evaluation metrics\n",
+    "cloms = [\"model\", \"accuracy\", \"cvs mean\", \"roc_auc\", \"recall\", \"precision\", \"f1\"]\n",
+    "results_compare = pd.DataFrame(columns=cloms)\n",
+    "results_compare[\"model\"] = models\n",
+    "# copy each metric column from the results array\n",
+    "for i, metric in enumerate(cloms[1:]):\n",
+    "    results_compare[metric] = results[:, i]\n",
+    "# sort the models by f1 and accuracy\n",
+    "results_compare.sort_values(by=[\"f1\", \"accuracy\"], inplace=True, ascending=False)\n",
+    "results_compare"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Section 5: model tuning"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "5.1 tune the logistic regression model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import GridSearchCV\n",
+    "# C must be positive, and the default lbfgs solver supports only the l2 (or no) penalty\n",
+    "lr_grid = GridSearchCV(lr, [{\"penalty\": [\"none\", \"l2\"], \"C\": [0.1, 1, 5, 10]}], scoring=\"accuracy\", cv=10)\n",
+    "lr_grid.fit(X_train, y_train)\n",
+    "best_accuracy = lr_grid.best_score_\n",
+    "best_parameters = lr_grid.best_params_\n",
+    "print(\"model\", lr, \"\\n best accuracy is:\", best_accuracy)\n",
+    "print(\"best parameters are:\", best_parameters)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import classification_report\n",
+    "y_pred = lr_grid.predict(X_test)\n",
+    "print(classification_report(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# plot the tuned logistic regression confusion matrix\n",
+    "cfm = confusion_matrix(y_test, y_pred)\n",
+    "sns.heatmap(cfm, annot=True, fmt='d', cmap='Blues')\n",
+    "plt.show()"
+   ]
+  },
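+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "GridSearchCV refits the best configuration on the full training set and exposes it as best_estimator_. A short sketch scoring it on the held-out test set with probability-based ROC AUC, complementing the label-based score used in evaluator (best_lr and y_prob are scratch names introduced only here):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# side sketch: probability-based ROC AUC of the tuned logistic regression\n",
+    "best_lr = lr_grid.best_estimator_\n",
+    "y_prob = best_lr.predict_proba(X_test)[:, 1]\n",
+    "print(\"test roc auc (probabilities):\", roc_auc_score(y_test, y_prob))"
+   ]
+  },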
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "5.2 tune the random forest model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rf_grid = GridSearchCV(rf, [{\"n_estimators\": [100, 150, 200], \"criterion\": [\"gini\", \"entropy\"], \"random_state\": [0]}], scoring=\"accuracy\", cv=10)\n",
+    "rf_grid.fit(X_train, y_train)\n",
+    "best_accuracy = rf_grid.best_score_\n",
+    "best_parameters = rf_grid.best_params_\n",
+    "print(\"model\", rf, \"\\n best accuracy is:\", best_accuracy)\n",
+    "print(\"best parameters are:\", best_parameters)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y_pred = rf_grid.predict(X_test)\n",
+    "print(classification_report(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# plot the tuned random forest confusion matrix\n",
+    "cfm = confusion_matrix(y_test, y_pred)\n",
+    "sns.heatmap(cfm, annot=True, fmt='d', cmap='Blues')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "5.3 tune the support vector machine model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# C must be positive\n",
+    "svm_grid = GridSearchCV(svm, [{'kernel': ['sigmoid', 'poly'], 'C': [0.1, 5, 10], 'gamma': [0.001, 0.0001]}], scoring=\"accuracy\", cv=10)\n",
+    "svm_grid.fit(X_train, y_train)\n",
+    "best_accuracy = svm_grid.best_score_\n",
+    "best_parameters = svm_grid.best_params_\n",
+    "print(\"model\", svm, \"\\n best accuracy is:\", best_accuracy)\n",
+    "print(\"best parameters are:\", best_parameters)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y_pred = svm_grid.predict(X_test)\n",
+    "print(classification_report(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# plot the tuned SVM confusion matrix\n",
+    "cfm = confusion_matrix(y_test, y_pred)\n",
+    "sns.heatmap(cfm, annot=True, fmt='d', cmap='Blues')\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.6 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "865d8b2eb28e274047ba64063dfb6a2aabf0dfec4905d304d7a76618dae6fdd4"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}