Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
6006CEM_CODE/code_no_markdown.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1150 lines (842 sloc)
49.3 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 6006CEM Machine Learning and Related Applications | |
#Name: Mohammed Fardhin Masud | |
#Student ID: 10355929 | |
# Import required libraries | |
import time | |
import pandas as pd | |
import numpy as np | |
from matplotlib import pyplot as plt | |
import seaborn as sns | |
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, GridSearchCV | |
from sklearn.preprocessing import LabelEncoder, StandardScaler | |
from sklearn.metrics import precision_recall_curve, roc_curve, accuracy_score | |
from sklearn.metrics import classification_report, confusion_matrix | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.neural_network import MLPClassifier | |
from sklearn.svm import LinearSVC | |
from xgboost import XGBClassifier | |
from sklearn.naive_bayes import GaussianNB | |
print("Imported libraries successfully") | |
# Data Import, Analysis and Preprocessing
# Load the raw weather observations into a pandas DataFrame.
dataset_path = "data/weatherAUS.csv"
df = pd.read_csv(dataset_path, index_col=False)
# Show dataframe info
df.info()
# Show the first 10 rows of the data. The result is printed explicitly because
# a bare expression is only displayed in a notebook, not when run as a script.
print(df.head(10))
# Numerical columns already have a numerical datatype so we do not need to convert them.
# We will be predicting the RainTomorrow column in this project. The data is already mostly clean but some preprocessing is still required.
# Check for duplicated data - https://towardsdatascience.com/the-ultimate-4-step-guide-to-clean-data-bd25f2f57956
# (The author recorded 0 duplicates and 2 unique target values for the original dataset.)
print("Number of duplicated rows:", df.duplicated().sum())
print("Number of unique RainTomorrow values:", df['RainTomorrow'].nunique())
# Plot the number of days that it rained and did not rain in the original dataset using countplot
sns.countplot(x="RainTomorrow", data=df)
plt.title("Original Dataset: RainTomorrow Column")
rain_tomorrow_counts = df["RainTomorrow"].value_counts()
print(rain_tomorrow_counts)  # Author's recorded output: No - 110316, Yes - 31877 (142193 in total)
# Derive the total from the data instead of hard-coding it, so the ratio stays
# correct if the CSV changes. value_counts() excludes NA values by default.
total_values = int(rain_tomorrow_counts.sum())
rain = df.loc[df['RainTomorrow'] == "Yes"].shape[0]
rain_ratio = rain / total_values
print(f"Chance of a day having RainTomorrow set to Yes (Original Dataset): {rain_ratio * 100:.2f}% - {rain} days out of {total_values}")
# Preprocessing data
# Discard every row that contains at least one missing value.
df.dropna(inplace=True)
# Break the Date column up into numeric Year, Month and Day features,
# then discard the original Date column.
parsed_dates = pd.to_datetime(df['Date'])
df['Year'] = parsed_dates.dt.year
df['Month'] = parsed_dates.dt.month
df['Day'] = parsed_dates.dt.day
df.drop(columns='Date', inplace=True)
# Label-encode the categorical columns (Aula, Week 4). The same encoder object
# is re-fitted for each column in turn: first the three compass-direction
# columns (e.g. N, E, S, W), then the two Yes/No columns.
le = LabelEncoder()
for encoded_column in ('WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow'):
    df[encoded_column] = le.fit_transform(df[encoded_column])
# The "object" columns are now integer codes, except for Location, which is
# one hot encoded later.
df.info()
# Heatmap of the pairwise correlation between all numeric features.
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True, fmt=".2f")
plt.show()
# Heatmap of each feature's correlation with the target, strongest positive first.
target_corr = corr_matrix[['RainTomorrow']].sort_values(by='RainTomorrow', ascending=False)
plt.figure(figsize=(4, 8))
sns.heatmap(target_corr, annot=True, fmt=".2f")
plt.show()
# Author's observation: of the other columns (excluding Location), the target is
# most correlated with Humidity3pm (positive) and Sunshine (negative), but both
# correlations are still weak.
## Outlier Detection
#This article helped detect outliers in the data:
#https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba
# List of columns to check for outliers: everything except the categorical
# Location column, the target, the date parts and the RainToday flag.
columns_to_check = df.columns.tolist()
for excluded_column in ('Location', 'RainTomorrow', 'Year', 'Month', 'Day', 'RainToday'):
    columns_to_check.remove(excluded_column)
print("Columns to check for outliers:", columns_to_check)
# Disabled to avoid bloating the output, but this is how the outliers in each
# column were found (one boxplot per column):
# for column in columns_to_check:
#     sns.boxplot(x=df[column])
#     plt.show()
# Outliers detected in the following columns:
# Evaporation, WindGustSpeed, Humidity9am, Pressure9am, Pressure3pm, Temp3pm
outlier_columns = ['Evaporation', 'WindGustSpeed', 'Humidity9am', 'Pressure9am', 'Pressure3pm', 'Temp3pm']
# Keep an untouched copy so the before/after boxplots can be compared.
original_df = df.copy()
# Replace each outlier (outside 1.5 * IQR of the quartiles) with the column median.
for column in outlier_columns:
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    IQR = q3 - q1
    lower_bound = q1 - (1.5 * IQR)
    upper_bound = q3 + (1.5 * IQR)
    is_outlier = (df[column] < lower_bound) | (df[column] > upper_bound)
    df.loc[is_outlier, column] = df[column].median()
    # Plot boxplots side by side to compare the original and modified data.
    # The size is passed via figsize: assigning to `fig.size` only creates an
    # unused attribute and leaves the figure at its default size.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
    fig.suptitle(f"{column}: Original vs Modified")
    sns.boxplot(x=original_df[column], ax=ax1)
    sns.boxplot(x=df[column], ax=ax2)
    ax1.set_title("Original")
    ax2.set_title("Modified")
    # Adjust the subplot spacing once the axes exist; calling tight_layout
    # before the figure is created cannot affect it.
    fig.tight_layout()
    plt.show()
# List of locations
Locations = np.unique(df['Location'])
# print("List of locations with no NAN values:", Locations)
# For each location, record [total days, days without rain, days with rain,
# rain ratio] for the target column RainTomorrow.
locations_dict = {}
for location in Locations:
    location_target = df.loc[df['Location'] == location, 'RainTomorrow']
    total_values = location_target.shape[0]
    no_rain = int((location_target == 0).sum())
    rain = int((location_target == 1).sum())
    rain_ratio = rain / total_values
    locations_dict[location] = [
        total_values,
        no_rain,
        rain,
        rain_ratio
    ]
# Aggregate the per-location figures.
num_locations = len(locations_dict)
all_total = sum(stats[0] for stats in locations_dict.values())
no_rain_total = sum(stats[1] for stats in locations_dict.values())
rain_total = sum(stats[2] for stats in locations_dict.values())
# Sum of the per-location rain ratios; avg_rain / num_locations is the
# unweighted mean across locations (kept for reference).
avg_rain = sum(stats[3] for stats in locations_dict.values())
# The overall chance must be computed from the pooled counts so that the
# percentage matches the "rain_total days out of all_total" figures printed
# next to it. The unweighted mean of per-location ratios (previously printed
# here, recorded by the author as 21.85%) differs whenever locations have
# different numbers of rows.
overall_rain_ratio = rain_total / all_total
sns.countplot(x="RainTomorrow", data=df)
plt.title("Modified Dataset: RainTomorrow Column")
print(f"Chance of a day having RainTomorrow set to 1 (Modified Dataset): {overall_rain_ratio*100:.2f}% - {rain_total} days out of {all_total}") # Compare with the original dataset (22.42%)
## Scaling, One Hot Encoding and Train Test Split
# Columns to scale using StandardScaler: everything except the categorical
# Location column, the target and the binary RainToday flag.
columns_to_scale = [
    column for column in df.columns.tolist()
    if column not in ('Location', 'RainTomorrow', 'RainToday')
]
# An earlier version scaled the whole DataFrame here, before the split:
#scaler = StandardScaler()
#df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])
print("Number of columns before one hot encoding:", len(df.columns))
# Use One hot encoding to encode the Location column, otherwise some models will fail to train
df = pd.get_dummies(df, columns=['Location'])
# Split the data into training and testing sets
split_size = 0.2
X = df.drop(['RainTomorrow'], axis = 1) # ALL columns except for RainTomorrow
y = df['RainTomorrow']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split_size, random_state = 0)
# Work on independent copies so that assigning the scaled columns below does
# not write into a slice of X (avoids pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()
# The earlier approach may have suffered from data leakage because the data was
# standardised before the split: the scaler's mean and standard deviation were
# computed from all rows, including the rows that later became the test set, so
# information about the test data influenced the training features.
# To fix this, the scaler is now fitted on the training data only and then
# applied, unchanged, to the test data.
# In the author's testing this didn't make a noticeable difference, perhaps only 1 or 2 extra true positives/negatives.
# Standardise the training and test data
scaler = StandardScaler()
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale]) # only transform the test data, not fit it
print(f"Training data shapes:\nX:{X_train.shape}\nY:{y_train.shape}")
print(f"Testing data shapes:\nX:{X_test.shape}\nY:{y_test.shape}")
print("Number of columns after one hot encoding:", X_train.shape[1])
# Print the first 5 rows of the training data (explicit print: a bare
# expression shows nothing when run as a script).
print(X_train.head())
# Machine Learning Algorithms | |
#In this section, we will be using the following classification algorithms: | |
#1. Logistic Regression (LR) | |
#2. Multilayer Perceptron (MLP) | |
#3. Extreme Gradient Boosting (XGB) | |
#4. Support Vector Machine (SVM) | |
#5. Naive Bayes (NB) - Gaussian | |
#I have chosen these classifiers because I wanted to try out different types of classifier and see how they perform on the data. My early expectation is that MLP and XGB will perform the best, as they are neural network based and tree based algorithms respectively, and that Naive Bayes will perform the worst, as it is a probabilistic classifier.
def _fit_and_score(label, model):
    """Fit ``model`` on the training split and score it on the test split.

    Prints the fit time and the test accuracy, both prefixed with ``label``.

    Args:
        label: Human-readable model name used in the printed messages.
        model: An unfitted estimator exposing ``fit`` and ``predict``.

    Returns:
        A ``(fit_time, y_pred, accuracy)`` tuple: the fit duration in seconds,
        the predictions for ``X_test`` and the accuracy against ``y_test``.
    """
    # perf_counter is a monotonic clock intended for measuring durations.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    fit_time = time.perf_counter() - start
    print(f"{label} fit time: {fit_time:.2f}s")
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{label} accuracy score: {accuracy*100:.2f}%')
    return fit_time, y_pred, accuracy


## Machine Learning Algorithm 1: Logistic Regression
#I will refer to this as LR.
#LR was introduced in week 3 of the resources on Aula and is a linear model.
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# NOTE: max_iter is not passed here, so the scikit-learn default (100) is used;
# an earlier comment claimed max_iter=200, which did not match the code.
LR = LogisticRegression(random_state = 0)
LR_fit_time, y_pred_test_LR, LR_accuracy = _fit_and_score("Logistic Regression", LR)

## Machine Learning Algorithm 2: Multi-layer Perceptron
#I will refer to this as MLP.
#MLPClassifier was introduced in Week 4 and is a neural network based model.
# The default max_iter is 200 and, per the author, training still does not
# converge; it is left unchanged because training already takes too long.
MLP = MLPClassifier(random_state=0)
MLP_fit_time, y_pred_test_MLP, MLP_accuracy = _fit_and_score("MLP Classifier", MLP)

## Machine Learning Algorithm 3: Extreme Gradient Boosting (XGBoost)
#I will refer to this as XGB. This is not taught on Aula so I thought this would be good to show something better than the "base" models.
#Other models exist such as LightGBM and CatBoost which are also tree based models but I have chosen XGB as it is the most popular.
#XGBoost can also automatically handle missing values and categorical variables but for this project it will be using the already preprocessed data to keep things fair.
#Some of the resources I used to learn about this algorithm:
#- https://xgboost.readthedocs.io/en/stable/tutorials/model.html
#- https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/
#- https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
#- https://www.kaggle.com/code/stuarthallows/using-xgboost-with-scikit-learn/notebook
#- https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py
XGB = XGBClassifier(random_state=0)
XGB_fit_time, y_pred_test_XGB, XGB_accuracy = _fit_and_score("XGB", XGB)

## Machine Learning Algorithm 4: Support Vector Machine (SVM)
#I will refer to this as SVM.
#SVM was introduced in week 8 of the resources on Aula and is also a linear model like LR.
#For SVM, I used the LinearSVC (`sklearn.svm.LinearSVC`) model instead of SVC (`sklearn.svm.SVC`) with a linear kernel as the latter takes a long time to run (but is more accurate).
#Below are some additional resources I used to learn about SVM:
#- https://towardsdatascience.com/svm-and-kernel-svm-fed02bef1200
#- https://scikit-learn.org/stable/modules/svm.html
#- https://scikit-learn.org/stable/modules/svm.html#unbalanced-problems (specifically class imbalance and class_weight)
#- https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
#- https://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use (Class Imbalance, standardisation, tuning parameters, etc.)
SVM = LinearSVC(random_state=0)
SVM_fit_time, y_pred_test_SVM, SVM_accuracy = _fit_and_score("SVM", SVM)

## Machine Learning Algorithm 5: Naive Bayes
#I will refer to this as NB.
#NB was introduced in week 9 of the resources on Aula and is a probabilistic model.
#Since this model is probabilistic this will run very fast. Unfortunately, the accuracy will be poor as the data in this project is not a good fit - It is a mix of qualitative and quantitative data despite being encoded as numbers. Still, the results will be interesting to see.
NB = GaussianNB()
NB_fit_time, y_pred_test_NB, NB_accuracy = _fit_and_score("NB", NB)
# Pre-Tuning Model Evaluation | |
#In this section, I will show the key metrics such as precision and recall via sklearn's built-in classification_report function. | |
#A confusion matrix will also be generated using the same library, but displayed via seaborn's heatmap plot which is essentially identical to `sklearn.metrics.ConfusionMatrixDisplay` | |
#Model Evaluation is covered in week 5 of the resources on Aula but the solutions have yet to be provided so I am not sure of the perfect approach to go with. | |
#As there is a small class imbalance (4:1 Negative:Positive) I am going to use a precision-recall curve instead of a ROC graph and calculating its AUC. | |
#The precision-recall curve code was adapted from https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/ as I could not find any solutions from Aula. | |
#I will still use the ROC curve to compare the models visually, but I will not use the AUC score as a key metric or comparison graph. | |
#After tuning the models, I will perform another final evaluation and choose the best model for the task. | |
# Display names for the encoded target classes (0 and 1 respectively).
_CLASS_LABELS = ["No Rain", "Rain"]


def _report_and_plot_confusion(report_name, short_name, y_pred, accuracy):
    """Print a classification report and plot the confusion matrix.

    Args:
        report_name: Model name used in the "Classification Report for ..." line.
        short_name: Abbreviation used in the plot title.
        y_pred: Predicted labels for ``X_test``.
        accuracy: Accuracy as a fraction, shown in the plot title.

    Returns:
        A ``(confusion_matrix, title)`` tuple.
    """
    print(f"Classification Report for {report_name}")
    print(classification_report(y_test, y_pred, target_names=_CLASS_LABELS))
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6,6))
    # The matrix holds integer counts, so format the annotations as integers
    # ("d") rather than with three decimals (".3f").
    sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap='Blues', xticklabels=_CLASS_LABELS, yticklabels=_CLASS_LABELS)
    plt.xlabel('Predicted label')
    plt.ylabel('Actual label')
    title = '{0} Accuracy: {1:.2f}'.format(short_name, accuracy)
    plt.title(title, size = 15)
    plt.show()
    return cm, title


# Every model now gets the same class labels in its report and heatmap;
# previously only Logistic Regression did.
## Default Logistic Regression Classifier
cm_LR, LR_CM_title = _report_and_plot_confusion("Logistic Regression", "LR", y_pred_test_LR, LR_accuracy)
## Default Multi-layer Perceptron Classifier
cm_MLP, MLP_CM_title = _report_and_plot_confusion("Multi Layer Perceptron Classifier", "MLP", y_pred_test_MLP, MLP_accuracy)
## Default XGBoost Classifier
cm_XGB, XGB_CM_title = _report_and_plot_confusion("XGB Classifier", "XGB", y_pred_test_XGB, XGB_accuracy)
## Default Support Vector Classifier
cm_SVM, SVM_CM_title = _report_and_plot_confusion("SVM", "SVM", y_pred_test_SVM, SVM_accuracy)
## Default Gaussian Naive Bayes Classifier
cm_NB, NB_CM_title = _report_and_plot_confusion("NB", "NB", y_pred_test_NB, NB_accuracy)
## Precision-Recall Curve and ROC AUC
#Resources:
#https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python


def _positive_class_scores(model):
    """Return a continuous score for the positive class (Rain) on ``X_test``.

    Uses ``predict_proba`` when the model provides it and falls back to
    ``decision_function`` otherwise (e.g. LinearSVC has no ``predict_proba``).
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X_test)[:, 1]
    return model.decision_function(X_test)


# Precision-recall and ROC curves are traced by sweeping a decision threshold
# over continuous scores. Passing the hard 0/1 predictions instead collapses
# every curve to a single operating point joined to the corners, so the scores
# are computed once per model and used for both plots.
_curve_inputs = [
    (curve_label, curve_color, _positive_class_scores(curve_model))
    for curve_label, curve_color, curve_model in (
        ('Logistic Regression', 'Purple', LR),
        ('MLP Classifier', 'Red', MLP),
        ('XGB Classifier', 'Green', XGB),
        ('SVM', 'Blue', SVM),
        ('Naive Bayes', 'Orange', NB),
    )
]
# Precision Recall Curves
# No point markers: with one point per threshold they would hide the lines.
for curve_label, curve_color, curve_scores in _curve_inputs:
    precision, recall, _ = precision_recall_curve(y_test, curve_scores)
    plt.plot(recall, precision, label=curve_label, color=curve_color)
plt.title('Precision-Recall Curve (Before Tuning)')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()
# ROC Curves
for curve_label, curve_color, curve_scores in _curve_inputs:
    fpr, tpr, _ = roc_curve(y_test, curve_scores)
    plt.plot(fpr, tpr, label=curve_label, color=curve_color)
# No Skill
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
plt.title('ROC Curve (Before Tuning)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
## Pre Model Tuning Conclusion | |
#The evaluation metrics show that the XGB Classifier and Logistic Regression Classifier are the best models for this task at the moment as they return very high accuracy from the short time taken to train on the data. After data standardisation, all models performed better especially SVM which used to take a large amount of computation time. | |
#Before standardising the data (using `sklearn.preprocessing.StandardScaler`), the original SVM Classifier (`sklearn.svm.SVC`) was not a good model as it would keep misclassifying the rain instances as no rain. What it was essentially doing was predicting no rain for all instances which was not worth the training time. | |
#After standardising the data however, the original SVM Classifier was able to predict rain instances correctly as the data was now closer together, meaning the algorithm takes less computation time to make calculations between points. Still, the LinearSVC (`sklearn.svm.LinearSVC`) model was able to perform much faster than the original SVM Classifier at the cost of a small drop in accuracy. To improve the Original SVM Classifier, I could also have experimented with different kernels such as the RBF kernel and kernel approximation. | |
#The MLP Classifier is also an effective model after normalising the data (as shown in the precision-recall curve graph), but it takes a long time to train on the data. This is most likely because it is a neural network based model and it has to train on the data multiple times to find the best weights. Due to these concerns, I will not be tuning this model further compared to the other more time efficient models that seem to do a similar job. | |
#The Naive Bayes model is not performing well - as expected - and will not be tuned further. | |
# Model Tuning | |
#In this section, I will perform hyperparameter tuning on the models to see if I can improve their performance, with a focus on computational time and accuracy. | |
#For Cross Validation, I will use Stratified K-Fold as the dataset is slightly imbalanced. I will use 3 folds as this should be enough for the dataset size. By using RepeatedStratifiedKFold, I can repeat the cross validation to get a more accurate result and reduce the variance in the results. I will use the weighted f1 score as the scoring metric, as it suits this dataset by taking the class imbalance into account.
#To tune the models, I will use GridSearchCV and RandomizedSearchCV (Both from scikit-learn) to find the best parameters for each model. The steps taken are as follows: | |
#1. Find an initial set of parameters using RandomizedSearchCV and see which ones are performing poorly and remove them from the search space. | |
#2. Use GridSearchCV to find the best parameters for the remaining parameters | |
#3. Repeat step 2 until the best parameters are found for each model | |
#4. Evaluate the models using the best parameters and compare the results | |
#I will be using the original train test split from the beginning of the project as I will be using the test set to evaluate the models after tuning. The verbose parameter in the GridSearchCV and RandomizedSearchCV functions will be set to show the progress of the tuning process and random state will be set to 0 for reproducibility. | |
#For the actual hyperparameter tuning, I will be tuning the XGB Classifier, Logistic Regression Classifier and SVM Classifier as they are the best performing models at the moment with regards to computation time (MLP Classifier takes a significant amount of time, but does produce good results).I will still provide the code for tuning MLP and NB as the code is very similar to the other models. | |
#- For LR, I will try different solvers, C values, number of iterations and the class_weight parameter. Originally the model did not converge within 100 iterations, so this will be tried out to see if the model with alternate parameters can converge within 100 iterations. | |
#- For MLP, I would have tried different activation functions, solvers, alpha values and hidden layer sizes and the code is provided for this. I will not be tuning this model as it takes a long time to train. The results from this model were very good and is close to the XGB Classifier as per the evaluation metrics in the previous section. | |
#- For XGB, I will try different values for the learning rate, max depth, and n_estimators. There are more parameters that can be tuned but I want to keep the tuning time low. | |
#- For SVM, since I am now using the LinearSVC model, I will try different values for the C parameter, the loss function and the penalty. For the original SVC I would have tried different C values and tried different kernels. | |
#- For NB, I will not tune the model as it will not work well with the data until it is fully standardised. I will still provide the code for evaluating a tuned model for the next section. | |
#Below is the final iteration of the code for tuning the models. I will not be showing the intermediate iterations, as that would take up too much space in this notebook.
#Unfortunately, setting a verbose value higher than 1 doesn't affect the output for me, most likely due to `n_jobs` also being greater than 1 indicating that the models are being trained in parallel. | |
## Logistic Regression Model Tuning
print("Default LR Params", LR.get_params())
# Grid search with repeated stratified 3-fold cross validation, scored by
# weighted F1. Each list below holds only the final chosen value.
LR_v2 = LogisticRegression(random_state=0)
solvers = ['lbfgs']
c_values = [100]
class_weights = [None]
n_iter = [100]
parameters = {
    'solver': solvers,
    'C': c_values,
    'class_weight': class_weights,
    'max_iter': n_iter,
}
skfold_LR_v2 = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=0)
grid_search_LR = GridSearchCV(
    estimator=LR_v2,
    param_grid=parameters,
    n_jobs=-1,
    cv=skfold_LR_v2,
    scoring='f1_weighted',
    verbose=3,
)
LR_CV_CLF = grid_search_LR.fit(X_train, y_train)
print("Best: %f using %s" % (LR_CV_CLF.best_score_, LR_CV_CLF.best_params_))
# Pull the per-candidate statistics out of the search results.
lr_cv_results = LR_CV_CLF.cv_results_
means = lr_cv_results['mean_test_score']
stds = lr_cv_results['std_test_score']
params = lr_cv_results['params']
mean_fit_time = lr_cv_results['mean_fit_time']
mean_score_time = lr_cv_results['mean_score_time']
# One separator line followed by a summary for every parameter combination.
for mean, stdev, param, fit_time, score_time in zip(means, stds, params, mean_fit_time, mean_score_time):
    print(
        "="*30,
        f"Mean Score: {mean:.6f} (std: {stdev:.3f}) with: {param}\nMean Fit Time: {fit_time:.3f}s\nMean Score Time: {score_time:.3f}s",
        sep="\n",
    )
# Tuning notes: | |
# - "saga" solver seems to be the slowest across the board. | |
# - lbfgs and liblinear are similar in time but liblinear is scoring higher | |
# - Setting "class_weight" to 'balanced' is inferior to keeping it as 'None' | |
# - Trying additional C Values was not beneficial - 0.1 Seems to be the best. | |
# Since the two solvers seem to be similar, I will choose lbfgs as my final solver (the default). This is because although it scores slightly lower than liblinear, It is always faster. | |
# Chosen params: {'C': 100, 'class_weight': None, 'solver': 'lbfgs'} - Larger C Value and solver set to lbfgs increases performance at the cost of a penalty to score (f1_weighted). The difference in results is 0.01% score for a 2x speedup in time. | |
## MLPClassifier Tuning (Not Performed)
#This has been commented out, but the code is provided to show how I would have tuned the MLP Classifier if I had more time and resources for this project.
# The `if 0:` guard never executes, and its body is only a string literal, so
# nothing below is run or even parsed as code.
# NOTE(review): before enabling this, RandomizedSearchCV would need to be added
# to the sklearn.model_selection import, and `skfold_MLP` is not defined in the
# visible part of this file - confirm and define it (e.g. like skfold_LR_v2).
if 0:
    """# MLP Classifier Hyperparameter Tuning
MLP_v2 = MLPClassifier(random_state=0, verbose=4)
activation = ['relu', 'tanh', 'identity', 'logistic']
solver = ['lbfgs', 'sgd', 'adam']
alpha = [0.0001, 0.05, 0.1]
learning_rate = ['constant', 'adaptive']
hidden_layers = [(50, 50, 50), (100,)]
parameters = dict(activation=activation, solver=solver, alpha=alpha, learning_rate=learning_rate, hidden_layer_sizes=hidden_layers)
random_search = RandomizedSearchCV(estimator=MLP_v2, param_distributions=parameters, n_iter=40, n_jobs=-1, cv=skfold_MLP, scoring='f1_weighted', verbose=4)
MLP_CV_CLF = random_search.fit(X_train, y_train)
# Initially RandomizedSearchCV was used to narrow down the best parameters, then GridSearchCV was used to find the best parameters. This is done automamatically as n_iter is set to 40 - the number of parameter settings that are tried out.
skfold_results_MLP_v2 = cross_val_score(MLP_CV_CLF, X, y, cv=skfold_MLP, n_jobs=-1)
print("Tuned Classifier Score: %.2f%% (%.2f%%)" % (skfold_results_MLP_v2.mean()*100, skfold_results_MLP_v2.std()*100))
print("Best %f using %s" % (MLP_CV_CLF.best_score_, MLP_CV_CLF.best_params_))
means = MLP_CV_CLF.cv_results_['mean_test_score']
stds = MLP_CV_CLF.cv_results_['std_test_score']
params = MLP_CV_CLF.cv_results_['params']
mean_fit_time = MLP_CV_CLF.cv_results_['mean_fit_time']
mean_score_time = MLP_CV_CLF.cv_results_['mean_score_time']
for mean, stdev, param, fit_time, score_time in zip(means, stds, params, mean_fit_time, mean_score_time):
print("="*30)
print(f"Mean Score: {mean:.6f} (std: {stdev:.3f}) with: {param}\nMean Fit Time: {fit_time:.3f}s\nMean Score Time: {score_time:.3f}s")"""
## XGBClassifier Tuning
#Best parameters to try and tune as there are many:
# - https://machinelearningmastery.com/tune-number-size-decision-trees-xgboost-python/
# - https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster
# - https://stackoverflow.com/questions/65983344/how-to-choose-the-values-of-n-estimators-and-seed-in-xgbregressor
print("Default XGB Params\n", XGB.get_params())
# Grid search with repeated stratified 3-fold cross validation, scored by
# weighted F1. Each list below holds only the final chosen value.
XGB_v2 = XGBClassifier(random_state=0)
learning_rate = [0.1]
n_estimators = [250]
max_depth = [9]
# Candidates for future tuning (not part of the current grid):
# min_child_weight = [1, 2, 3, 4, 5]
# gamma = [0.0, 0.1, 0.2, 0.3, 0.4]
# subsample = [0.6, 0.7, 0.8, 0.9, 1.0]
# objective = ['binary:hinge', 'binary:logitraw']
# Earlier / future grids, kept for reference:
#parameters = dict(learning_rate=learning_rate, max_depth=max_depth)
#parameters = dict(learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth, min_child_weight=min_child_weight, gamma=gamma, subsample=subsample, objective=objective)
parameters = {
    'learning_rate': learning_rate,
    'n_estimators': n_estimators,
    'max_depth': max_depth,
}
skfold_XGB_v2 = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)
grid_search_XGB = GridSearchCV(
    estimator=XGB_v2,
    param_grid=parameters,
    n_jobs=-1,
    cv=skfold_XGB_v2,
    scoring='f1_weighted',
    verbose=4,
)
XGB_CV_CLF = grid_search_XGB.fit(X_train, y_train)
print("Best %f using %s" % (XGB_CV_CLF.best_score_, XGB_CV_CLF.best_params_))
# Pull the per-candidate statistics out of the search results.
xgb_cv_results = XGB_CV_CLF.cv_results_
means = xgb_cv_results['mean_test_score']
stds = xgb_cv_results['std_test_score']
params = xgb_cv_results['params']
mean_fit_time = xgb_cv_results['mean_fit_time']
mean_score_time = xgb_cv_results['mean_score_time']
# One separator line followed by a summary for every parameter combination.
for mean, stdev, param, fit_time, score_time in zip(means, stds, params, mean_fit_time, mean_score_time):
    print(
        "="*30,
        f"Mean Score: {mean:.6f} (std: {stdev:.3f}) with: {param}\nMean Fit Time: {fit_time:.3f}s\nMean Score Time: {score_time:.3f}s",
        sep="\n",
    )
# Comments on the runs
# The `if False:` guard is never true, so the string below is never evaluated.
# It serves only as a multi-line notes block recording what each XGBoost
# search round showed and how the final parameters were chosen.
if False:
    """Initial Random Run:
    Tuned XGB Classifier Score: 85.84% (0.23%)
    Best 0.855266 using {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.3}
    0.854981 (0.001904) with: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.3}
    0.855266 (0.002031) with: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.3}
    0.855082 (0.002418) with: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.3}
    0.855012 (0.002152) with: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.2}
    - Max depth and learning rate seem to be worth investigating further. n_estimators can be refined later
    Grid Search: XGB_2.txt
    Best 0.855527 using {'learning_rate': 0.1, 'max_depth': 10}
    This confirms that a higher max_depth increases performance as this also had the highest scores.
    However, a lower learning rate also increased scores (and training time).
    I will use max depths of 8, 9 and 10 and learning rates 0.1 and 0.2 now with n_estimators to find the best values in the next search.
    Also setting n_repeats to 2 to save time and reduce total fits needed.
    XGB_4.txt:
    max_depth of 9 and 10 did not get any better results, so are being checked further in the next repeat. I will use 4, 6 and 8 to see the effect this makes.
    n_estimators seems to increase the training time, but comparing 300vs200 for and learning rates shows that higher learning rates negatively affect the score.
    It will be set to 300 to compare differences in max_depth further
    XGB_5.txt:
    max_depth of 8 was enough to reach 0.858 so now I will optimise the learning rate to a decent time and n_estimators again.
    XGB_6.txt:
    From this we can see 0.1 is the optimal learning rate and now n_estimators needs to be adjusted
    XGB_7.txt:
    n_estimators gave the best performance - time ratio and will finalise the parameters before 1 last check with the max_depth parameter
    XGB_8.txt:
    The optimal parameters have been found, but I want to check the max_depth now we have different values, so I am trying 8, 9 and 10
    XGB_9.txt:
    max_depth=9 seems to give the best and most consistent results, so the final parameters are {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 250}.
    During testing, objective parameters did not produce good results so were not tested further,
    """
## LinearSVC Model Tuning
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
print("Default SVM Params", SVM.get_params())

# SVM Classifier Hyperparameter Tuning
# Candidate values per hyperparameter. Each list holds a single value, so the
# grid search below evaluates one parameter combination.
C = [0.1]
penalty = ['l2']
loss = ['squared_hinge']
dual = [False]
parameters = {
    "C": C,
    "penalty": penalty,
    "loss": loss,
    "dual": dual,
}

SVM_v2 = LinearSVC(random_state=0, dual=False)
skfold_SVM_v2 = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)
grid_search_SVC = GridSearchCV(
    estimator=SVM_v2,
    param_grid=parameters,
    scoring='f1_weighted',
    cv=skfold_SVM_v2,
    n_jobs=-1,
    verbose=4,
)
SVM_CV_CLF = grid_search_SVC.fit(X_train, y_train)
print("Best %f using %s" % (SVM_CV_CLF.best_score_, SVM_CV_CLF.best_params_))

# Per-candidate summary: score, spread and timing for every parameter setting tried.
svm_cv_results = SVM_CV_CLF.cv_results_
means = svm_cv_results['mean_test_score']
stds = svm_cv_results['std_test_score']
params = svm_cv_results['params']
mean_fit_time = svm_cv_results['mean_fit_time']
mean_score_time = svm_cv_results['mean_score_time']
for mean, stdev, param, fit_time, score_time in zip(means, stds, params, mean_fit_time, mean_score_time):
    print(
        "="*30,
        f"Mean Score: {mean:.6f} (std: {stdev:.3f}) with: {param}\nMean Fit Time: {fit_time:.3f}s\nMean Score Time: {score_time:.3f}s",
        sep="\n",
    )
# Naive Bayes tuning is disabled: `if 0:` is never true, so the string literal
# below (which holds the tuning code as text) is never evaluated. Nothing in
# this block defines NB_v2 or NB_CV_CLF at runtime.
if 0:"""
# Naive Bayes Classifier Hyperparameter Tuning
NB_v2 = GaussianNB()
var_smoothing = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]
parameters = dict(var_smoothing=var_smoothing)
skfold_NB_v2 = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)
grid_search_NB = GridSearchCV(estimator=NB_v2, param_grid=parameters, n_jobs=-1, cv=skfold_NB_v2, scoring='f1_weighted', verbose=4)
NB_CV_CLF = grid_search_NB.fit(X_train, y_train)
print("Best %f using %s" % (NB_CV_CLF.best_score_, NB_CV_CLF.best_params_))
means = NB_CV_CLF.cv_results_['mean_test_score']
stds = NB_CV_CLF.cv_results_['std_test_score']
params = NB_CV_CLF.cv_results_['params']
mean_fit_time = NB_CV_CLF.cv_results_['mean_fit_time']
mean_score_time = NB_CV_CLF.cv_results_['mean_score_time']
for mean, stdev, param, fit_time, score_time in zip(means, stds, params, mean_fit_time, mean_score_time):
    print("="*30)
    print(f"Mean Score: {mean:.6f} (std: {stdev:.3f}) with: {param}\nMean Fit Time: {fit_time:.3f}s\nMean Score Time: {score_time:.3f}s")"""
# Tuned Model Evaluation Metrics
# Identical to the metrics performed before hyperparameter tuning, this section evaluates the newly tuned models.
# RepeatedStratifiedKFold from scikit-learn is used to cross validate the data, with weighted F1 as the
# scoring metric (the same metric the grid searches optimise).
# Repeated Stratified K Fold Cross Validation with k=3 and n_repeats=2
# https://machinelearningmastery.com/cross-validation-for-imbalanced-classification/

# cross_val_score falls back to the estimator's own score() method (plain
# accuracy for classifiers) when no scoring is given, so the metric is passed
# explicitly to every call below. This keeps original and tuned models on the
# same scale as the 'f1_weighted' grid searches.
CV_SCORING = 'f1_weighted'

# Original models:
skfold_LR = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)  # k=10 is a popular choice for evaluating a model, but 3 is usually enough for larger datasets so will be used when tuning hyperparameters
skfold_results_LR = cross_val_score(LR, X, y, cv=skfold_LR, scoring=CV_SCORING, n_jobs=-1)
print(f"Time taken to fit LR: {LR_fit_time:.3f}s")
print("Original Logistic Regression Score: %.2f%% (%.2f%%)" % (skfold_results_LR.mean()*100, skfold_results_LR.std()*100))

#skfold_MLP = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=0)
#skfold_results_MLP = cross_val_score(MLP, X, y, cv=skfold_MLP, scoring=CV_SCORING, n_jobs=-1)
#print("Original MLP Classifier Score: %.2f%% (%.2f%%)" % (skfold_results_MLP.mean()*100, skfold_results_MLP.std()*100))

skfold_XGB = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)
skfold_results_XGB = cross_val_score(XGB, X, y, cv=skfold_XGB, scoring=CV_SCORING, n_jobs=-1)
print(f"Time taken to fit XGB: {XGB_fit_time:.3f}s")
print("Original XGB Classifier Score: %.2f%% (%.2f%%)" % (skfold_results_XGB.mean()*100, skfold_results_XGB.std()*100))

skfold_SVM = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)
skfold_results_SVM = cross_val_score(SVM, X, y, cv=skfold_SVM, scoring=CV_SCORING, n_jobs=-1)
print(f"Time taken to fit SVM: {SVM_fit_time:.3f}s")
print("Original SVM Classifier Score: %.2f%% (%.2f%%)" % (skfold_results_SVM.mean()*100, skfold_results_SVM.std()*100))

#skfold_NB = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=0)
#skfold_results_NB = cross_val_score(NB, X, y, cv=skfold_NB, scoring=CV_SCORING, n_jobs=-1)
#print("Original Naive Bayes Classifier Score: %.2f%% (%.2f%%)" % (skfold_results_NB.mean()*100, skfold_results_NB.std()*100))
print("="*30)

# Re-fit and cross validate with tuned parameters
# Passing parameters directly - https://stackoverflow.com/a/33119615/19543236
# Each tuned model is fitted once on the training split (timed, for the
# fit-time comparison) and then cross validated with the same splitter and
# metric as its original counterpart.

#LR - class_weight and solver are the defaults
LR_tuned_params = {
    'C': 100,
    'class_weight': None,
    'solver': 'lbfgs',
    'random_state': 0,
}
LR_v2 = LogisticRegression(**LR_tuned_params)
start_time = time.time()
LR_v2.fit(X_train, y_train)
LR_v2_fit_time = time.time() - start_time
print(f"Time taken to fit LR_v2: {LR_v2_fit_time:.3f}s")
skfold_results_LR_v2 = cross_val_score(LR_v2, X, y, cv=skfold_LR, scoring=CV_SCORING, n_jobs=-1)
print("Tuned Logistic Regression Score: %.2f%% (%.2f%%)" % (skfold_results_LR_v2.mean()*100, skfold_results_LR_v2.std()*100))

#XGB
XGB_tuned_params = {
    'learning_rate': 0.1,
    'max_depth': 9,
    'n_estimators': 250,
    'random_state': 0,
}
XGB_v2 = XGBClassifier(**XGB_tuned_params)
start_time = time.time()
XGB_v2.fit(X_train, y_train)
XGB_v2_fit_time = time.time() - start_time
print(f"Time taken to fit XGB_v2: {XGB_v2_fit_time:.3f}s")
skfold_results_XGB_v2 = cross_val_score(XGB_v2, X, y, cv=skfold_XGB, scoring=CV_SCORING, n_jobs=-1)
print("Tuned XGB Classifier Score: %.2f%% (%.2f%%)" % (skfold_results_XGB_v2.mean()*100, skfold_results_XGB_v2.std()*100))

# SVM penalty and loss are the defaults
SVM_tuned_params = {
    'C': 0.1,
    'dual': False,
    'loss': 'squared_hinge',
    'penalty': 'l2',
    'random_state': 0,
}
SVM_v2 = LinearSVC(**SVM_tuned_params)
start_time = time.time()
SVM_v2.fit(X_train, y_train)
SVM_v2_fit_time = time.time() - start_time
print(f"Time taken to fit SVM_v2: {SVM_v2_fit_time:.3f}s")
skfold_results_SVM_v2 = cross_val_score(SVM_v2, X, y, cv=skfold_SVM, scoring=CV_SCORING, n_jobs=-1)
print("Tuned SVM Classifier Score: %.2f%% (%.2f%%)" % (skfold_results_SVM_v2.mean()*100, skfold_results_SVM_v2.std()*100))
## Tuned Logistic Regression Classifier
# Classification report and confusion matrix for the tuned Logistic Regression
# model, evaluated on X_test / y_test.
y_pred_test_LR_v2 = LR_v2.predict(X_test)
print(classification_report(y_test, y_pred_test_LR_v2))
cm_LR_v2 = confusion_matrix(y_test, y_pred_test_LR_v2)
plt.figure(figsize=(6,6))
# Confusion-matrix cells are integer counts, so annotate them with 'd' (as the
# v1-vs-v2 comparison heatmaps in this file do) rather than '.3f', which
# renders every count with three meaningless decimals (e.g. 1234.000).
sns.heatmap(cm_LR_v2, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
LR_v2_accuracy = accuracy_score(y_test, y_pred_test_LR_v2)
LR_v2_CM_title = 'LR v2 Accuracy: {0:.2f}'.format(LR_v2_accuracy)
plt.title(LR_v2_CM_title, size=15)
plt.show()
## Tuned Multi-layer Perceptron Classifier
# Disabled: `if False:` is never true, so the string literal below (which
# holds the MLP evaluation code as text) is never evaluated and
# y_pred_test_MLP_v2 is never defined at runtime.
# NOTE(review): the code in the string reads MLP_CV_CLF, which the MLP search
# would create; that search also appears to be wrapped in a string earlier in
# the file — confirm before re-enabling.
if False:
    """"# classification report for MLP Classifier
    y_pred_test_MLP_v2 = MLP_CV_CLF.predict(X_test)
    print(classification_report(y_test, y_pred_test_MLP_v2))
    cm_MLP_v2 = confusion_matrix(y_test, y_pred_test_MLP_v2)
    plt.figure(figsize=(6,6))
    sns.heatmap(cm_MLP_v2, annot=True, fmt=".3f", linewidths=.5, square = True, cmap='Blues')
    plt.xlabel('Predicted label')
    plt.ylabel('Actual label')
    MLP_v2_accuracy = accuracy_score(y_test, y_pred_test_MLP_v2)
    MLP_v2_CM_title = 'MLP v2 Accuracy: {0:.2f}'.format(MLP_v2_accuracy)
    plt.title(MLP_v2_CM_title, size = 15)
    plt.show()"""
## Tuned XGBoost Classifier
# Classification report and confusion matrix for the tuned XGB Classifier,
# evaluated on X_test / y_test.
y_pred_test_XGB_v2 = XGB_v2.predict(X_test)
print(classification_report(y_test, y_pred_test_XGB_v2))
cm_XGB_v2 = confusion_matrix(y_test, y_pred_test_XGB_v2)
plt.figure(figsize=(6,6))
# Confusion-matrix cells are integer counts, so annotate them with 'd' (as the
# v1-vs-v2 comparison heatmaps in this file do) rather than '.3f', which
# renders every count with three meaningless decimals (e.g. 1234.000).
sns.heatmap(cm_XGB_v2, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
XGB_v2_accuracy = accuracy_score(y_test, y_pred_test_XGB_v2)
XGB_v2_CM_title = 'XGB v2 Accuracy: {0:.2f}'.format(XGB_v2_accuracy)
plt.title(XGB_v2_CM_title, size=15)
plt.show()
## Tuned Support Vector Classifier
# Classification report and confusion matrix for the tuned LinearSVC,
# evaluated on X_test / y_test.
# Predict with SVM_v2 — the model re-fitted with the tuned parameters, whose
# fit time is what gets reported — rather than the GridSearchCV wrapper
# (SVM_CV_CLF). This matches how LR_v2 and XGB_v2 are evaluated and removes
# the dependency on the grid search having been run.
y_pred_test_SVM_v2 = SVM_v2.predict(X_test)
print(classification_report(y_test, y_pred_test_SVM_v2))
cm_SVM_v2 = confusion_matrix(y_test, y_pred_test_SVM_v2)
plt.figure(figsize=(6,6))
# Confusion-matrix cells are integer counts, so annotate them with 'd' (as the
# v1-vs-v2 comparison heatmaps in this file do) rather than '.3f', which
# renders every count with three meaningless decimals (e.g. 1234.000).
sns.heatmap(cm_SVM_v2, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
SVM_v2_accuracy = accuracy_score(y_test, y_pred_test_SVM_v2)
SVM_v2_CM_title = 'SVM v2 Accuracy: {0:.2f}'.format(SVM_v2_accuracy)
plt.title(SVM_v2_CM_title, size=15)
plt.show()
## Tuned Naive Bayes
# This was not a focus, so the code is provided for reference only.
# `if 0:` is never true, so the string literal below (which holds the NB
# evaluation code as text) is never evaluated and y_pred_test_NB_v2 is never
# defined at runtime.
if 0:
    """# classification report for NB Classifier
    y_pred_test_NB_v2 = NB_v2.predict(X_test)
    print(classification_report(y_test, y_pred_test_NB_v2))
    cm_NB_v2 = confusion_matrix(y_test, y_pred_test_NB_v2)
    plt.figure(figsize=(6,6))
    sns.heatmap(cm_NB_v2, annot=True, fmt=".3f", linewidths=.5, square = True, cmap='Blues')
    plt.xlabel('Predicted label')
    plt.ylabel('Actual label')
    NB_v2_accuracy = accuracy_score(y_test, y_pred_test_NB_v2)
    NB_v2_CM_title = 'NB v2 Accuracy: {0:.2f}'.format(NB_v2_accuracy)
    plt.title(NB_v2_CM_title, size = 15)
    plt.show()"""
## Precision-Recall and ROC AUC for tuned models
# precision_recall_curve and roc_curve sweep a decision threshold over
# continuous scores. Feeding them the hard 0/1 labels from predict() leaves a
# single usable threshold, so each "curve" collapses to one operating point
# joined to the corners. Use positive-class probabilities (or the SVM decision
# function) so the full trade-off is drawn.
y_score_LR_v2 = LR_v2.predict_proba(X_test)[:, 1]
y_score_XGB_v2 = XGB_v2.predict_proba(X_test)[:, 1]
# LinearSVC has no predict_proba; its signed distance to the separating
# hyperplane is a valid ranking score for both curve functions.
y_score_SVM_v2 = SVM_v2.decision_function(X_test)

# Per-point markers are omitted: a real curve has one point per distinct
# score, and markers would smear into a thick band.
# LR v2 Precision Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_score_LR_v2)
plt.plot(recall, precision, label='Logistic Regression', color='purple')
# MLP v2 Precision Recall Curve
#precision, recall, _ = precision_recall_curve(y_test, MLP_CV_CLF.predict_proba(X_test)[:, 1])
#plt.plot(recall, precision, label='Multi-Layer Perceptron Classifier', color='Red')
# XGBoost v2 Precision Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_score_XGB_v2)
plt.plot(recall, precision, label='XGBoost Classifier', color='Green')
# SVM v2 Precision Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_score_SVM_v2)
plt.plot(recall, precision, label='Support Vector Machine Classifier', color='Blue')
# NB v2 Precision Recall Curve
#precision, recall, _ = precision_recall_curve(y_test, NB_v2.predict_proba(X_test)[:, 1])
#plt.plot(recall, precision, label='Naive Bayes Classifier', color='Orange')
plt.title('Precision-Recall Curve (After Tuning)')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

# ROC Curves
# LR ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_score_LR_v2)
plt.plot(fpr, tpr, label='Logistic Regression v2', color='Purple')
# MLP ROC Curve
#fpr, tpr, _ = roc_curve(y_test, MLP_CV_CLF.predict_proba(X_test)[:, 1])
#plt.plot(fpr, tpr, label='MLP Classifier v2', color='Red')
# XGB ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_score_XGB_v2)
plt.plot(fpr, tpr, label='XGB Classifier v2', color='Green')
# SVM ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_score_SVM_v2)
plt.plot(fpr, tpr, label='LinearSVC v2', color='Blue')
# NB ROC Curve
#fpr, tpr, _ = roc_curve(y_test, NB_v2.predict_proba(X_test)[:, 1])
#plt.plot(fpr, tpr, label='Naive Bayes v2', color='Orange')
# No Skill
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
plt.title('ROC Curve (After Tuning)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
# Original vs Tuned Comparison
# Comparing the original/default and tuned models. This section is essentially a summary of the previous sections.
## LR v1 vs LR v2 Comparison
# The curve functions need continuous scores; hard 0/1 predictions collapse
# each curve to a single operating point, which hides any difference between
# the two models away from the default threshold.
# NOTE(review): assumes LR is the default model already fitted on the training
# split earlier in the script (its fit time and test predictions are used in
# this file) — confirm.
y_score_LR = LR.predict_proba(X_test)[:, 1]
y_score_LR_v2 = LR_v2.predict_proba(X_test)[:, 1]

# Precision Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_score_LR)
plt.plot(recall, precision, label='Logistic Regression v1', color='Purple')
precision, recall, _ = precision_recall_curve(y_test, y_score_LR_v2)
plt.plot(recall, precision, label='Logistic Regression v2', color='Red')
plt.title('Precision-Recall Curve (LR v1 vs LR v2 Comparison)')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_score_LR)
plt.plot(fpr, tpr, label='Logistic Regression v1', color='Purple')
fpr, tpr, _ = roc_curve(y_test, y_score_LR_v2)
plt.plot(fpr, tpr, label='Logistic Regression v2', color='Red')
plt.title('ROC Curve (LR v1 vs LR v2 Comparison)')
# Axis labels were missing here, unlike the "After Tuning" ROC plot.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Fit times bar graph
plt.bar(['Logistic Regression v1', 'Logistic Regression v2'], [LR_fit_time, LR_v2_fit_time], color=['Purple', 'Red'])
plt.title('Fit Times (LR v1 vs LR v2 Comparison)')
plt.ylabel('Fit Time (s)')
plt.show()
## XGB v1 vs XGB v2 Comparison
# The curve functions need continuous scores; hard 0/1 predictions collapse
# each curve to a single operating point, which hides any difference between
# the two models away from the default threshold.
# NOTE(review): assumes XGB is the default model already fitted on the
# training split earlier in the script (its fit time and test predictions are
# used in this file) — confirm.
y_score_XGB = XGB.predict_proba(X_test)[:, 1]
y_score_XGB_v2 = XGB_v2.predict_proba(X_test)[:, 1]

# Precision Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_score_XGB)
plt.plot(recall, precision, label='XGB v1', color='Purple')
precision, recall, _ = precision_recall_curve(y_test, y_score_XGB_v2)
plt.plot(recall, precision, label='XGB v2', color='Red')
plt.title('Precision-Recall Curve (XGB v1 vs XGB v2 Comparison)')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_score_XGB)
plt.plot(fpr, tpr, label='XGB v1', color='Purple')
fpr, tpr, _ = roc_curve(y_test, y_score_XGB_v2)
plt.plot(fpr, tpr, label='XGB v2', color='Red')
plt.title('ROC Curve (XGB v1 vs XGB v2 Comparison)')
# Axis labels were missing here, unlike the "After Tuning" ROC plot.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Fit times bar graph
plt.bar(['XGB v1', 'XGB v2'], [XGB_fit_time, XGB_v2_fit_time], color=['Purple', 'Red'])
plt.title('Fit Times (XGB v1 vs XGB v2 Comparison)')
plt.ylabel('Fit Time (s)')
plt.show()
## SVM v1 vs SVM v2 Comparison
# These models perform very similarly, but the tuned model is much faster to fit and predict.
# The curve functions need continuous scores; hard 0/1 predictions collapse
# each curve to a single operating point. LinearSVC has no predict_proba, so
# the decision function (signed distance to the hyperplane) is used instead.
# NOTE(review): assumes SVM is the default LinearSVC already fitted on the
# training split earlier in the script (its fit time and test predictions are
# used in this file) — confirm.
y_score_SVM = SVM.decision_function(X_test)
y_score_SVM_v2 = SVM_v2.decision_function(X_test)

# Precision Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_score_SVM)
plt.plot(recall, precision, label='SVM v1', color='Purple')
precision, recall, _ = precision_recall_curve(y_test, y_score_SVM_v2)
plt.plot(recall, precision, label='SVM v2', color='Red')
plt.title('Precision-Recall Curve (SVM v1 vs SVM v2 Comparison)')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_score_SVM)
plt.plot(fpr, tpr, label='SVM v1', color='Purple')
fpr, tpr, _ = roc_curve(y_test, y_score_SVM_v2)
plt.plot(fpr, tpr, label='SVM v2', color='Red')
plt.title('ROC Curve (SVM v1 vs SVM v2 Comparison)')
# Axis labels were missing here, unlike the "After Tuning" ROC plot.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Fit times bar graph
plt.bar(['SVM v1', 'SVM v2'], [SVM_fit_time, SVM_v2_fit_time], color=['Purple', 'Red'])
plt.title('Fit Times (SVM v1 vs SVM v2 Comparison)')
plt.ylabel('Fit Time (s)')
plt.show()
# Side-by-side confusion matrices for each model family: the original (v1)
# model on the left axis and the tuned (v2) model on the right, one figure per
# family, in the order LR, XGB, SVM.
# RdPu shows up better than Blues and the default colours on my monitor (https://scipy-cookbook.readthedocs.io/items/Matplotlib_Show_colormaps.html)
confusion_matrix_pairs = [
    # LR v1 confusion matrix vs LR v2 confusion matrix
    ('Logistic Regression v1 Confusion Matrix', y_pred_test_LR,
     'Logistic Regression v2 Confusion Matrix', y_pred_test_LR_v2),
    # XGB v1 confusion matrix vs XGB v2 confusion matrix
    ('XGB v1 Confusion Matrix', y_pred_test_XGB,
     'XGB v2 Confusion Matrix', y_pred_test_XGB_v2),
    # SVM v1 confusion matrix vs SVM v2 confusion matrix
    ('SVM v1 Confusion Matrix', y_pred_test_SVM,
     'SVM v2 Confusion Matrix', y_pred_test_SVM_v2),
]
for title_v1, preds_v1, title_v2, preds_v2 in confusion_matrix_pairs:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
    sns.heatmap(confusion_matrix(y_test, preds_v1), annot=True, fmt='d', ax=ax1, cmap='RdPu')
    ax1.set_title(title_v1)
    sns.heatmap(confusion_matrix(y_test, preds_v2), annot=True, fmt='d', ax=ax2, cmap='RdPu')
    ax2.set_title(title_v2)
    plt.show()
# Print the v1 and v2 classification reports for each model family in turn
# (LR, then XGB, then SVM), with a separator line between families.
report_groups = [
    # LR v1 classification report vs LR v2 classification report
    [('Logistic Regression v1 Classification Report', y_pred_test_LR),
     ('Logistic Regression v2 Classification Report', y_pred_test_LR_v2)],
    # XGB v1 classification report vs XGB v2 classification report
    [('XGB v1 Classification Report', y_pred_test_XGB),
     ('XGB v2 Classification Report', y_pred_test_XGB_v2)],
    # SVM v1 classification report vs SVM v2 classification report
    [('SVM v1 Classification Report', y_pred_test_SVM),
     ('SVM v2 Classification Report', y_pred_test_SVM_v2)],
]
for group_index, report_group in enumerate(report_groups):
    # The separator goes between families only: none before the first or after the last.
    if group_index > 0:
        print("="*50)
    for heading, predictions in report_group:
        print(heading)
        print(classification_report(y_test, predictions))