# code_no_markdown.py

# 6006CEM Machine Learning and Related Applications

#Name: Mohammed Fardhin Masud
#Student ID: 10355929

# Import required libraries

import time

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.metrics import precision_recall_curve, roc_curve, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB

print("Imported libraries successfully")


# Data Import, Analysis and Preprocessing

# Load the weather observations into a pandas DataFrame.
# index_col=False tells pandas not to use the first column as the index.
dataset_path = "data/weatherAUS.csv"
df = pd.read_csv(dataset_path, index_col=False)

# Show dataframe info (df.info() writes its summary to stdout itself)
df.info()

# Show the first 10 rows of the data.
# A bare `df.head(10)` is only rendered by a notebook cell; in a script the
# result is silently discarded, so it is printed explicitly.
print(df.head(10))

# Numerical columns already have a numerical datatype so we do not need to convert them.
# We will be predicting the RainTomorrow column in this project. The data is already mostly clean but some preprocessing is still required.

# Check for duplicated data - https://towardsdatascience.com/the-ultimate-4-step-guide-to-clean-data-bd25f2f57956
# These checks were bare notebook expressions whose values are discarded when
# the file runs as a script, so the results are printed explicitly.
print("Duplicated rows:", df.duplicated().sum())  # 0 means there are no duplicates in the data
print("Unique RainTomorrow values:", df['RainTomorrow'].nunique())  # 2 unique values are expected

# Plot the number of days that it rained and did not rain in the original dataset using countplot
sns.countplot(x="RainTomorrow", data=df)
plt.title("Original Dataset: RainTomorrow Column")
plt.show()  # show now so this plot is not merged into the next plt.show() call
print(df["RainTomorrow"].value_counts())

# Number of rows with a known RainTomorrow value, derived from the data
# instead of a hard-coded constant so it stays correct if the CSV changes.
total_values = int(df['RainTomorrow'].notna().sum())
rain = df.loc[df['RainTomorrow'] == "Yes"].shape[0]
rain_ratio = rain / total_values
print(f"Chance of a day having RainTomorrow set to Yes (Original Dataset): {rain_ratio * 100:.2f}% - {rain} days out of {total_values}")


# Preprocessing data

# Discard every row that contains at least one missing value
df.dropna(inplace=True)

# Parse the Date strings, then expand them into separate numeric
# Year / Month / Day columns (added in that order)
df['Date'] = pd.to_datetime(df['Date'])
for part in ('Year', 'Month', 'Day'):
    df[part] = getattr(df['Date'].dt, part.lower())

# The raw Date column is no longer needed once it has been expanded
df.drop(columns='Date', inplace=True)

# Label-encode the categorical columns (Aula, Week 4).
# One encoder instance is re-fitted per column, so after the loop `le` holds
# the mapping of the last column processed (RainTomorrow).
le = LabelEncoder()
for column in (
    'WindGustDir', 'WindDir9am', 'WindDir3pm',  # compass directions e.g. N, E, S, W
    'RainToday', 'RainTomorrow',                # Yes/No flags
):
    df[column] = le.fit_transform(df[column])

# The encoded columns are now integer-typed; Location is still "object" and
# is one hot encoded later
df.info()


# Correlation matrix of the numeric columns, computed once and reused below
correlations = df.corr(numeric_only=True)

# Heatmap of the correlation between every pair of features
plt.figure(figsize=(20, 20))
sns.heatmap(correlations, annot=True, fmt=".2f")
plt.show()

# Heatmap of each feature's correlation with the target, strongest positive first
target_correlations = correlations[['RainTomorrow']].sort_values(by='RainTomorrow', ascending=False)
plt.figure(figsize=(4, 8))
sns.heatmap(target_correlations, annot=True, fmt=".2f")
plt.show()

# This shows that, of the other columns (excluding Location), the target column is most correlated with Humidity3pm and Sunshine, but both are still weak correlations (positive and negative respectively).

## Outlier Detection

#This article helped detect outliers in the data:
#https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba

# Columns to inspect for outliers: every column except the categorical
# Location, the target, the date parts and the RainToday flag
columns_to_check = list(df.columns)
for excluded in ('Location', 'RainTomorrow', 'Year', 'Month', 'Day', 'RainToday'):
    columns_to_check.remove(excluded)

print("Columns to check for outliers:", columns_to_check)

"""# Commented out to avoid bloating the notebook too much but I used this to find the outliers in the columns
# Create a boxplot for each column
for column in columns_to_check:
    sns.boxplot(x=df[column])
    plt.show()
"""

# Outliers detected in the following columns:
# Evaporation, WindGustSpeed, Humidity9am, Pressure9am, Pressure3pm, Temp3pm

outlier_columns = ['Evaporation', 'WindGustSpeed', 'Humidity9am', 'Pressure9am', 'Pressure3pm', 'Temp3pm']

# Keep an untouched copy so the before/after boxplots can be compared
original_df = df.copy()

# Replace outliers with the column median. A value is an outlier when it lies
# more than 1.5 * IQR below the first quartile or above the third quartile.
for column in outlier_columns:
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    IQR = q3 - q1
    lower_bound = q1 - (1.5 * IQR)
    upper_bound = q3 + (1.5 * IQR)
    is_outlier = (df[column] < lower_bound) | (df[column] > upper_bound)
    # The median is evaluated on the column as it is before this assignment
    df.loc[is_outlier, column] = df[column].median()

    # Plot boxplots side by side to compare the original and modified data.
    # The size is passed at creation time: assigning `fig.size` only sets an
    # unused attribute and never resized the figure.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
    fig.suptitle(f"{column}: Original vs Modified")

    sns.boxplot(x=original_df[column], ax=ax1)
    sns.boxplot(x=df[column], ax=ax2)

    ax1.set_title("Original")
    ax2.set_title("Modified")

    # tight_layout must run after the axes are populated; calling
    # plt.tight_layout() before plt.subplots() acted on whatever figure was
    # current (or created an empty one) instead of this one.
    fig.tight_layout()
    plt.show()


# List of locations
Locations = np.unique(df['Location'])
# print("List of locations with no NAN values:", Locations)

# Per-location day counts for the target column, computed in one groupby pass
# instead of re-filtering the whole DataFrame three times per location.
# RainTomorrow was label-encoded from two values (0 = no rain, 1 = rain), so
# the group sum is the number of rainy days and the group size is the number
# of days recorded for that location.
location_counts = df.groupby('Location')['RainTomorrow'].agg(['size', 'sum'])

# location -> [total days, days without rain, days with rain, rain ratio]
locations_dict = {}
for location in Locations:
    total_values = int(location_counts.at[location, 'size'])
    rain = int(location_counts.at[location, 'sum'])
    no_rain = total_values - rain

    rain_ratio = rain / total_values

    locations_dict[location] = [
        total_values,
        no_rain,
        rain,
        rain_ratio
        ]

# Totals across all locations
num_locations = len(locations_dict)
all_total = sum(stats[0] for stats in locations_dict.values())
no_rain_total = sum(stats[1] for stats in locations_dict.values())
rain_total = sum(stats[2] for stats in locations_dict.values())
# Sum of the per-location ratios (name kept from the original code);
# avg_rain / num_locations is the unweighted mean over locations.
avg_rain = sum(stats[3] for stats in locations_dict.values())

# The message quotes "rain_total days out of all_total", so the percentage is
# the day-weighted ratio. The previous value (avg_rain / num_locations) gave
# every location equal weight regardless of how many days it contributed, so
# it did not match the counts printed next to it and was not comparable with
# the day-based figure reported for the original dataset.
overall_rain_ratio = rain_total / all_total if all_total else float('nan')

sns.countplot(x="RainTomorrow", data=df)
plt.title("Modified Dataset: RainTomorrow Column")
plt.show()  # show now so this plot is not merged into a later figure
print(f"Chance of a day having RainTomorrow set to 1 (Modified Dataset): {overall_rain_ratio*100:.2f}% - {rain_total} days out of {all_total}")


## Scaling, One Hot Encoding and Train Test Split

# Columns to scale using StandardScaler: everything except the categorical
# Location column, the target and the binary RainToday flag.
# The list is built before one hot encoding so the Location_* dummy columns
# are never standardised.
columns_to_scale = df.columns.tolist()
columns_to_scale.remove('Location')
columns_to_scale.remove('RainTomorrow')
columns_to_scale.remove('RainToday')

print("Number of columns before one hot encoding:", len(df.columns))

# Use One hot encoding to encode the Location column, otherwise some models will fail to train
df = pd.get_dummies(df, columns=['Location'])

# Split the data into training and testing sets
split_size = 0.2

X = df.drop(['RainTomorrow'], axis = 1)  # ALL columns except for RainTomorrow
y = df['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split_size, random_state = 0)

# train_test_split returns selections of X; take explicit copies so the
# column assignments below modify independent frames and cannot trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardise AFTER the split to avoid data leakage: the scaler learns its
# mean and standard deviation from the training rows only, and those same
# statistics are then applied to the test rows. Fitting on the full dataset
# before splitting would let test-set statistics influence training.
# In testing, this didn't actually make a noticeable difference, perhaps only
# 1 or 2 extra true positives/negatives.
scaler = StandardScaler()
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])  # only transform the test data, not fit it

print(f"Training data shapes:\nX:{X_train.shape}\nY:{y_train.shape}")
print(f"Testing data shapes:\nX:{X_test.shape}\nY:{y_test.shape}")

# Feature columns only: unlike the "before" count, this excludes the target
print("Number of columns after one hot encoding:", X_train.shape[1])


# Print the first 5 rows of training data (a bare `X_train.head()` shows
# nothing when run as a script)
print(X_train.head())

# Machine Learning Algorithms

#In this section, we will be using the following classification algorithms:
#1. Logistic Regression (LR)
#2. Multilayer Perceptron (MLP)
#3. Extreme Gradient Boosting (XGB)
#4. Support Vector Machine (SVM)
#5. Naive Bayes (NB) - Gaussian

#I have chosen these classifiers as I wanted to try out different types of classifiers and see how they perform on the data. Early expectations are that the MLP and XGB will perform the best, as they are neural network and tree based algorithms respectively, and that Naive Bayes will perform the worst as it is a probabilistic classifier.

## Machine Learning Algorithm 1: Logistic Regression

#I will refer to this as LR.
#LR was introduced in week 3 of the resources on Aula and is a linear model.

#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

# Machine Learning Algorithm 1: Logistic Regression
# NOTE: max_iter is left at the estimator default. The previous comment here
# claimed max_iter=200, but the code never passed it; the comment is corrected
# rather than the model changed so the reported baseline stays the same.
LR = LogisticRegression(random_state = 0)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
LR.fit(X_train, y_train)
LR_fit_time = time.perf_counter() - start_time
print(f"Logistic Regression fit time: {LR_fit_time:.2f}s")

# Accuracy of the hard class predictions on the held-out test split
y_pred_test_LR = LR.predict(X_test)
LR_accuracy = accuracy_score(y_test, y_pred_test_LR)
print(f'Logistic Regression accuracy score: {LR_accuracy*100:.2f}%')

## Machine Learning Algorithm 2: Multi-layer Perceptron

#I will refer to this as MLP.
#MLPClassifier was introduced in Week 4 and is a neural network based model.

# Machine Learning Algorithm 2: Multi-layer Perceptron (MLP) Classifier
# The default max_iter is 200 and this still doesn't converge, but it is left
# unchanged because training already takes too long.
MLP = MLPClassifier(random_state=0)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
MLP.fit(X_train, y_train)
MLP_fit_time = time.perf_counter() - start_time
print(f"MLP Classifier fit time: {MLP_fit_time:.2f}s")

# Accuracy of the hard class predictions on the held-out test split
y_pred_test_MLP = MLP.predict(X_test)
MLP_accuracy = accuracy_score(y_test, y_pred_test_MLP)
print(f'MLP Classifier accuracy score: {MLP_accuracy*100:.2f}%')

## Machine Learning Algorithm 3: Extreme Gradient Boosting (XGBoost)

#I will refer to this as XGB. This is not taught on Aula so I thought this would be good to show something better than the "base" models.
#Other models exist such as LightGBM and CatBoost which are also tree based models but I have chosen XGB as it is the most popular.
#XGBoost can also automatically handle missing values and categorical variables but for this project it will be using the already preprocessed data to keep things fair.

#Some of the resources I used to learn about this algorithm:
#- https://xgboost.readthedocs.io/en/stable/tutorials/model.html
#- https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/
#- https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
#- https://www.kaggle.com/code/stuarthallows/using-xgboost-with-scikit-learn/notebook
#- https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py

# Machine Learning Algorithm 3: Extreme Gradient Boosting (XGB) Classifier
XGB = XGBClassifier(random_state=0)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
XGB.fit(X_train, y_train)
XGB_fit_time = time.perf_counter() - start_time
print(f"XGB fit time: {XGB_fit_time:.2f}s")

# Accuracy of the hard class predictions on the held-out test split
y_pred_test_XGB = XGB.predict(X_test)
XGB_accuracy = accuracy_score(y_test, y_pred_test_XGB)
print(f'XGB accuracy score: {XGB_accuracy*100:.2f}%')

## Machine Learning Algorithm 4: Support Vector Machine (SVM)

#I will refer to this as SVM.
#SVM was introduced in week 8 of the resources on Aula and is also a linear model like LR.

#For SVM, I used the LinearSVC (`sklearn.svm.LinearSVC`) model instead of SVC (`sklearn.svm.SVC`) with a linear kernel as the latter takes a long time to run (but is more accurate).

#Below are some additional resources I used to learn about SVM:
#- https://towardsdatascience.com/svm-and-kernel-svm-fed02bef1200
#- https://scikit-learn.org/stable/modules/svm.html
#- https://scikit-learn.org/stable/modules/svm.html#unbalanced-problems (specifically class imbalance and class_weight)
#- https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
#- https://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use (Class Imbalance, standardisation, tuning parameters, etc.)


# Machine Learning Algorithm 4: Support Vector Machine (SVM)
# LinearSVC is used here rather than SVC with a linear kernel.

SVM = LinearSVC(random_state=0)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
SVM.fit(X_train, y_train)
SVM_fit_time = time.perf_counter() - start_time
print(f"SVM fit time: {SVM_fit_time:.2f}s")

# Accuracy of the hard class predictions on the held-out test split
y_pred_test_SVM = SVM.predict(X_test)
SVM_accuracy = accuracy_score(y_test, y_pred_test_SVM)
print(f'SVM accuracy score: {SVM_accuracy*100:.2f}%')

## Machine Learning Algorithm 5: Naive Bayes

#I will refer to this as NB.
#NB was introduced in week 9 of the resources on Aula and is a probabilistic model.

#Since this model is probabilistic this will run very fast. Unfortunately, the accuracy will be poor as the data in this project is not a good fit - It is a mix of qualitative and quantitative data despite being encoded as numbers. Still, the results will be interesting to see.


# Machine Learning Algorithm 5: Naive Bayes (NB) Classifier

NB = GaussianNB()

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
NB.fit(X_train, y_train)
NB_fit_time = time.perf_counter() - start_time
print(f"NB fit time: {NB_fit_time:.2f}s")

# Accuracy of the hard class predictions on the held-out test split
y_pred_test_NB = NB.predict(X_test)
NB_accuracy = accuracy_score(y_test, y_pred_test_NB)
print(f'NB accuracy score: {NB_accuracy*100:.2f}%')

# Pre-Tuning Model Evaluation

#In this section, I will show the key metrics such as precision and recall via sklearn's built-in classification_report function.
#A confusion matrix will also be generated using the same library, but displayed via seaborn's heatmap plot which is essentially identical to `sklearn.metrics.ConfusionMatrixDisplay`

#Model Evaluation is covered in week 5 of the resources on Aula but the solutions have yet to be provided so I am not sure of the perfect approach to go with.

#As there is a small class imbalance (4:1 Negative:Positive) I am going to use a precision-recall curve instead of a ROC graph and calculating its AUC.
#The precision-recall curve code was adapted from https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/ as I could not find any solutions from Aula.

#I will still use the ROC curve to compare the models visually, but I will not use the AUC score as a key metric or comparison graph.

#After tuning the models, I will perform another final evaluation and choose the best model for the task.

## Pre-tuning evaluation: classification report and confusion matrix per model

# Display names for the encoded target classes (0 -> "No Rain", 1 -> "Rain")
CLASS_LABELS = ["No Rain", "Rain"]


def _report_and_plot_confusion(report_heading, title_prefix, y_true, y_pred, accuracy):
    """Print a classification report and show the confusion-matrix heatmap.

    Args:
        report_heading: Line printed above the classification report.
        title_prefix: Short model name used in the plot title.
        y_true: True labels of the evaluation set.
        y_pred: Hard class predictions for the same samples.
        accuracy: Accuracy as a fraction, shown in the plot title.

    Returns:
        Tuple ``(confusion matrix, plot title)``.
    """
    print(report_heading)
    print(classification_report(y_true, y_pred, target_names=CLASS_LABELS))

    cm = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(6, 6))
    # Cells are integer counts, so they are formatted with "d"; the previous
    # ".3f" rendered every count with a meaningless ".000" suffix.
    sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues',
                xticklabels=CLASS_LABELS, yticklabels=CLASS_LABELS)

    plt.xlabel('Predicted label')
    plt.ylabel('Actual label')

    title = '{0} Accuracy: {1:.2f}'.format(title_prefix, accuracy)
    plt.title(title, size=15)

    plt.show()
    return cm, title


# Every model now gets the same class labels in both the report and the plot
# (previously only Logistic Regression had them).

## Default Logistic Regression Classifier
cm_LR, LR_CM_title = _report_and_plot_confusion(
    "Classification Report for Logistic Regression", "LR",
    y_test, y_pred_test_LR, LR_accuracy)

## Default Multi-layer Perceptron Classifier
cm_MLP, MLP_CM_title = _report_and_plot_confusion(
    "Classification Report for Multi Layer Perceptron Classifier", "MLP",
    y_test, y_pred_test_MLP, MLP_accuracy)

## Default XGBoost Classifier
cm_XGB, XGB_CM_title = _report_and_plot_confusion(
    "Classification Report for XGB Classifier", "XGB",
    y_test, y_pred_test_XGB, XGB_accuracy)

## Default Support Vector Classifier
cm_SVM, SVM_CM_title = _report_and_plot_confusion(
    "Classification Report for SVM", "SVM",
    y_test, y_pred_test_SVM, SVM_accuracy)

## Default Gaussian Naive Bayes Classifier
cm_NB, NB_CM_title = _report_and_plot_confusion(
    "Classification Report for NB", "NB",
    y_test, y_pred_test_NB, NB_accuracy)

## Precision-Recall Curve and ROC AUC

#Resources:
#https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python


# Precision-Recall and ROC curves (before tuning)

# Both curve functions sweep a decision threshold, so they need a continuous
# score per sample. Passing the hard 0/1 predictions (as was done before)
# collapses each curve to a single operating point joined to the end points
# by straight lines. The positive-class probability is used where the model
# provides predict_proba, and the signed margin from decision_function is
# used for LinearSVC, which has no predict_proba.
curve_inputs = [
    # (legend label, positive-class score on the test set, line colour)
    ('Logistic Regression', LR.predict_proba(X_test)[:, 1], 'Purple'),
    ('MLP Classifier', MLP.predict_proba(X_test)[:, 1], 'Red'),
    ('XGB Classifier', XGB.predict_proba(X_test)[:, 1], 'Green'),
    ('SVM', SVM.decision_function(X_test), 'Blue'),
    ('Naive Bayes', NB.predict_proba(X_test)[:, 1], 'Orange'),
]

# Precision Recall Curves
# Per-point markers are omitted: with continuous scores each curve has many
# points and markers would hide the line.
for model_label, model_scores, colour in curve_inputs:
    precision, recall, _ = precision_recall_curve(y_test, model_scores)
    plt.plot(recall, precision, label=model_label, color=colour)

plt.title('Precision-Recall Curve (Before Tuning)')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

# ROC Curves
for model_label, model_scores, colour in curve_inputs:
    fpr, tpr, _ = roc_curve(y_test, model_scores)
    plt.plot(fpr, tpr, label=model_label, color=colour)

# No Skill (a random classifier lies on the diagonal)
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')

plt.title('ROC Curve (Before Tuning)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

## Pre Model Tuning Conclusion

#The evaluation metrics show that the XGB Classifier and Logistic Regression Classifier are the best models for this task at the moment as they return very high accuracy from the short time taken to train on the data. After data standardisation, all models performed better especially SVM which used to take a large amount of computation time.

#Before standardising the data (using `sklearn.preprocessing.StandardScaler`), the original SVM Classifier (`sklearn.svm.SVC`) was not a good model as it would keep misclassifying the rain instances as no rain. What it was essentially doing was predicting no rain for all instances which was not worth the training time.

#After standardising the data however, the original SVM Classifier was able to predict rain instances correctly as the data was now closer together, meaning the algorithm takes less computation time to make calculations between points. Still, the LinearSVC (`sklearn.svm.LinearSVC`) model was able to perform much faster than the original SVM Classifier at the cost of a small drop in accuracy. To improve the Original SVM Classifier, I could also have experimented with different kernels such as the RBF kernel and kernel approximation.

#The MLP Classifier is also an effective model after normalising the data (as shown in the precision-recall curve graph), but it takes a long time to train on the data. This is most likely because it is a neural network based model and it has to train on the data multiple times to find the best weights. Due to these concerns, I will not be tuning this model further compared to the other more time efficient models that seem to do a similar job.

#The Naive Bayes model is not performing well - as expected - and will not be tuned further.

# Model Tuning

#In this section, I will perform hyperparameter tuning on the models to see if I can improve their performance, with a focus on computational time and accuracy.

#For Cross Validation, I will use Stratified K-Fold as the dataset is slightly imbalanced. I will use 3 folds as this should be enough for the dataset size. By using RepeatStratifiedKFold, I can repeat the cross validation to get a more accurate result and reduce the variance in the results. I will use the weighted f1 score as the scoring metric as it is a good metric for the dataset by taking into account the class imbalance.

#To tune the models, I will use GridSearchCV and RandomizedSearchCV (Both from scikit-learn) to find the best parameters for each model. The steps taken are as follows:
#1. Find an initial set of parameters using RandomizedSearchCV and see which ones are performing poorly and remove them from the search space.
#2. Use GridSearchCV to find the best parameters for the remaining parameters
#3. Repeat step 2 until the best parameters are found for each model
#4. Evaluate the models using the best parameters and compare the results

#I will be using the original train test split from the beginning of the project as I will be using the test set to evaluate the models after tuning. The verbose parameter in the GridSearchCV and RandomizedSearchCV functions will be set to show the progress of the tuning process and random state will be set to 0 for reproducibility.

#For the actual hyperparameter tuning, I will be tuning the XGB Classifier, Logistic Regression Classifier and SVM Classifier as they are the best performing models at the moment with regards to computation time (MLP Classifier takes a significant amount of time, but does produce good results). I will still provide the code for tuning MLP and NB as the code is very similar to the other models.

#- For LR, I will try different solvers, C values, number of iterations and the class_weight parameter. Originally the model did not converge within 100 iterations, so this will be tried out to see if the model with alternate parameters can converge within 100 iterations.
#- For MLP, I would have tried different activation functions, solvers, alpha values and hidden layer sizes and the code is provided for this. I will not be tuning this model as it takes a long time to train. The results from this model were very good and is close to the XGB Classifier as per the evaluation metrics in the previous section.
#- For XGB, I will try different values for the learning rate, max depth, and n_estimators. There are more parameters that can be tuned but I want to keep the tuning time low.
#- For SVM, since I am now using the LinearSVC model, I will try different values for the C parameter, the loss function and the penalty. For the original SVC I would have tried different C values and tried different kernels.
#- For NB, I will not tune the model as it will not work well with the data until it is fully standardised. I will still provide the code for evaluating a tuned model for the next section.

#Below is the final iteration of the code for tuning the models. I will not be showing the iterations involved as that would take up too much space in this notebook.
#Unfortunately, setting a verbose value higher than 1 doesn't affect the output for me, most likely due to `n_jobs` also being greater than 1 indicating that the models are being trained in parallel.


## Logistic Regression Model Tuning

print("Default LR Params", LR.get_params())

# Grid search with repeated stratified cross-validation.
# Each list below holds only the value chosen in the final tuning iteration,
# so the grid contains a single candidate.

LR_v2 = LogisticRegression(random_state=0)

solvers = ['lbfgs']
c_values = [100]
class_weights = [None]
n_iter = [100]

parameters = {
    'solver': solvers,
    'C': c_values,
    'class_weight': class_weights,
    'max_iter': n_iter,
}
skfold_LR_v2 = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=0)

grid_search_LR = GridSearchCV(
    estimator=LR_v2,
    param_grid=parameters,
    n_jobs=-1,
    cv=skfold_LR_v2,
    scoring='f1_weighted',
    verbose=3,
)
LR_CV_CLF = grid_search_LR.fit(X_train, y_train)

print(f"Best: {LR_CV_CLF.best_score_:f} using {LR_CV_CLF.best_params_}")

# Per-candidate summary: mean/std of the CV score plus mean fit and score times
cv_results = LR_CV_CLF.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
mean_fit_time = cv_results['mean_fit_time']
mean_score_time = cv_results['mean_score_time']
for mean, stdev, param, fit_time, score_time in zip(means, stds, params, mean_fit_time, mean_score_time):
    print("=" * 30)
    print(f"Mean Score: {mean:.6f} (std: {stdev:.3f}) with: {param}")
    print(f"Mean Fit Time: {fit_time:.3f}s")
    print(f"Mean Score Time: {score_time:.3f}s")

# Tuning notes:
# - "saga" solver seems to be the slowest across the board.
# - lbfgs and liblinear are similar in time but liblinear is scoring higher
# - Setting "class_weight" to 'balanced' is inferior to keeping it as 'None'
# - Trying additional C Values was not beneficial - 0.1 Seems to be the best.
# Since the two solvers seem to be similar, I will choose lbfgs as my final solver (the default). This is because although it scores slightly lower than liblinear, It is always faster.

# Chosen params: {'C': 100, 'class_weight': None, 'solver': 'lbfgs'} - Larger C Value and solver set to lbfgs increases performance at the cost of a penalty to score (f1_weighted). The difference in results is 0.01% score for a 2x speedup in time.

## MLPClassifier Tuning (Not Performed)

#This has been commented out, but the code is provided to show how I would have tuned the MLP Classifier if I had more time and resources for this project.

# Disabled MLP tuning sketch: the `if 0:` guard is never true and its body is
# a single string literal, so nothing below is ever executed.
# NOTE(review): the text inside the string refers to RandomizedSearchCV and
# skfold_MLP, neither of which is imported or defined in the part of the file
# reviewed here - both would be needed before this code could be enabled.
if 0:
    """# MLP Classifier Hyperparameter Tuning
MLP_v2 = MLPClassifier(random_state=0, verbose=4)

activation = ['relu', 'tanh', 'identity', 'logistic']
solver = ['lbfgs', 'sgd', 'adam']
alpha = [0.0001, 0.05, 0.1]
learning_rate = ['constant', 'adaptive']
hidden_layers = [(50, 50, 50), (100,)]

parameters = dict(activation=activation, solver=solver, alpha=alpha, learning_rate=learning_rate, hidden_layer_sizes=hidden_layers)

random_search = RandomizedSearchCV(estimator=MLP_v2, param_distributions=parameters, n_iter=40, n_jobs=-1, cv=skfold_MLP, scoring='f1_weighted', verbose=4)
MLP_CV_CLF = random_search.fit(X_train, y_train)
# Initially RandomizedSearchCV was used to narrow down the best parameters, then GridSearchCV was used to find the best parameters. This is done automamatically as n_iter is set to 40 - the number of parameter settings that are tried out.

skfold_results_MLP_v2 = cross_val_score(MLP_CV_CLF, X, y, cv=skfold_MLP, n_jobs=-1)
print("Tuned Classifier Score: %.2f%% (%.2f%%)" % (skfold_results_MLP_v2.mean()*100, skfold_results_MLP_v2.std()*100))

print("Best %f using %s" % (MLP_CV_CLF.best_score_, MLP_CV_CLF.best_params_))
means = MLP_CV_CLF.cv_results_['mean_test_score']
stds = MLP_CV_CLF.cv_results_['std_test_score']
params = MLP_CV_CLF.cv_results_['params']
mean_fit_time = MLP_CV_CLF.cv_results_['mean_fit_time']
mean_score_time = MLP_CV_CLF.cv_results_['mean_score_time']
for mean, stdev, param, fit_time, score_time in zip(means, stds, params, mean_fit_time, mean_score_time):
    print("="*30)
    print(f"Mean Score: {mean:.6f} (std: {stdev:.3f}) with: {param}\nMean Fit Time: {fit_time:.3f}s\nMean Score Time: {score_time:.3f}s")"""


## XGBClassifier Tuning

#Best parameters to try and tune as there are many:
# - https://machinelearningmastery.com/tune-number-size-decision-trees-xgboost-python/
# - https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster
# - https://stackoverflow.com/questions/65983344/how-to-choose-the-values-of-n-estimators-and-seed-in-xgbregressor

print("Default XGB Params\n", XGB.get_params())

# Hyperparameter search for the XGB classifier.
# Each list below holds only the value chosen in the final tuning iteration,
# so the grid contains a single candidate.

XGB_v2 = XGBClassifier(random_state=0)

learning_rate = [0.1]
n_estimators = [250]
max_depth = [9]
# Further parameters that could be added to the grid in the future:
# min_child_weight = [1, 2, 3, 4, 5]
# gamma = [0.0, 0.1, 0.2, 0.3, 0.4]
# subsample = [0.6, 0.7, 0.8, 0.9, 1.0]
# objective = ['binary:hinge', 'binary:logitraw']

parameters = {
    'learning_rate': learning_rate,
    'n_estimators': n_estimators,
    'max_depth': max_depth,
}

skfold_XGB_v2 = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)

grid_search_XGB = GridSearchCV(
    estimator=XGB_v2,
    param_grid=parameters,
    n_jobs=-1,
    cv=skfold_XGB_v2,
    scoring='f1_weighted',
    verbose=4,
)
XGB_CV_CLF = grid_search_XGB.fit(X_train, y_train)

print(f"Best {XGB_CV_CLF.best_score_:f} using {XGB_CV_CLF.best_params_}")

# Per-candidate summary: mean/std of the CV score plus mean fit and score times
cv_results = XGB_CV_CLF.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
mean_fit_time = cv_results['mean_fit_time']
mean_score_time = cv_results['mean_score_time']
for mean, stdev, param, fit_time, score_time in zip(means, stds, params, mean_fit_time, mean_score_time):
    print("=" * 30)
    print(f"Mean Score: {mean:.6f} (std: {stdev:.3f}) with: {param}")
    print(f"Mean Fit Time: {fit_time:.3f}s")
    print(f"Mean Score Time: {score_time:.3f}s")

# Comments on the runs
# Free-text log of the XGB tuning iterations. The `if False:` guard is never
# true and its body is a single string literal, so this has no runtime effect;
# it only records how the final XGB parameters were reached.
if False:
    """Initial Random Run:

    Tuned XGB Classifier Score: 85.84% (0.23%)
    Best 0.855266 using {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.3}
    0.854981 (0.001904) with: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.3}
    0.855266 (0.002031) with: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.3}
    0.855082 (0.002418) with: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.3}
    0.855012 (0.002152) with: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.2}

    - Max depth and learning rate seem to be worth investigating further. n_estimators can be refined later

    Grid Search: XGB_2.txt
    Best 0.855527 using {'learning_rate': 0.1, 'max_depth': 10}

    This confirms that a higher max_depth increases performance as this also had the highest scores.
    However, a lower learning rate also increased scores (and training time).
    I will use max depths of 8, 9 and 10 and learning rates 0.1 and 0.2 now with n_estimators to find the best values in the next search.
    Also setting n_repeats to 2 to save time and reduce total fits needed.

    XGB_4.txt:
    max_depth of 9 and 10 did not get any better results, so are being checked further in the next repeat. I will use 4, 6 and 8 to see the effect this makes.
    n_estimators seems to increase the training time, but comparing 300vs200 for and learning rates shows that higher learning rates negatively affect the score.
    It will be set to 300 to compare differences in max_depth further

    XGB_5.txt:
    max_depth of 8 was enough to reach 0.858 so now I will optimise the learning rate to a decent time and n_estimators again.

    XGB_6.txt:
    From this we can see 0.1 is the optimal learning rate and now n_estimators needs to be adjusted

    XGB_7.txt:
    n_estimators gave the best performance - time ratio and will finalise the parameters before 1 last check with the max_depth parameter

    XGB_8.txt:
    The optimal parameters have been found, but I want to check the max_depth now we have different values, so I am trying 8, 9 and 10

    XGB_9.txt:
    max_depth=9 seems to give the best and most consistent results, so the final parameters are {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 250}.

    During testing, objective parameters did not produce good results so were not tested further,

    """


## LinearSVC Model Tuning

#https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

# Print the settings of the existing SVM estimator for reference before the search.
print("Default SVM Params", SVM.get_params())

# SVM Classifier Hyperparameter Tuning
# Every candidate list below holds a single value, so the grid contains exactly one
# parameter combination and the search reports its cross-validated score and timings.

C = [0.1]
penalty = ['l2']
loss = ['squared_hinge']
dual = [False]
parameters = {'C': C, 'penalty': penalty, 'loss': loss, 'dual': dual}

SVM_v2 = LinearSVC(random_state=0, dual=False)
skfold_SVM_v2 = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)

grid_search_SVC = GridSearchCV(
    estimator=SVM_v2,
    param_grid=parameters,
    n_jobs=-1,
    cv=skfold_SVM_v2,
    scoring='f1_weighted',
    verbose=4,
)
# Keep whatever fit() returns; the summary below reads its results from this object.
SVM_CV_CLF = grid_search_SVC.fit(X_train, y_train)

print("Best %f using %s" % (SVM_CV_CLF.best_score_, SVM_CV_CLF.best_params_))

# Pull the per-candidate columns out of cv_results_ in one pass.
means, stds, params, mean_fit_time, mean_score_time = (
    SVM_CV_CLF.cv_results_[column]
    for column in ('mean_test_score', 'std_test_score', 'params', 'mean_fit_time', 'mean_score_time')
)

# One report per parameter combination: score, spread and timings.
for mean, stdev, param, fit_time, score_time in zip(means, stds, params, mean_fit_time, mean_score_time):
    print("="*30)
    print(f"Mean Score: {mean:.6f} (std: {stdev:.3f}) with: {param}\nMean Fit Time: {fit_time:.3f}s\nMean Score Time: {score_time:.3f}s")


# Naive Bayes tuning is disabled. `if 0:` never executes its body, and the body is a
# single string literal anyway, so none of the code quoted inside it runs.
# It is kept only as a record of the search that was tried.
if 0:"""
# Naive Bayes Classifier Hyperparameter Tuning

NB_v2 = GaussianNB()

var_smoothing = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]

parameters = dict(var_smoothing=var_smoothing)
skfold_NB_v2 = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)

grid_search_NB = GridSearchCV(estimator=NB_v2, param_grid=parameters, n_jobs=-1, cv=skfold_NB_v2, scoring='f1_weighted', verbose=4)
NB_CV_CLF = grid_search_NB.fit(X_train, y_train)

print("Best %f using %s" % (NB_CV_CLF.best_score_, NB_CV_CLF.best_params_))
means = NB_CV_CLF.cv_results_['mean_test_score']
stds = NB_CV_CLF.cv_results_['std_test_score']
params = NB_CV_CLF.cv_results_['params']
mean_fit_time = NB_CV_CLF.cv_results_['mean_fit_time']
mean_score_time = NB_CV_CLF.cv_results_['mean_score_time']
for mean, stdev, param, fit_time, score_time in zip(means, stds, params, mean_fit_time, mean_score_time):
    print("="*30)
    print(f"Mean Score: {mean:.6f} (std: {stdev:.3f}) with: {param}\nMean Fit Time: {fit_time:.3f}s\nMean Score Time: {score_time:.3f}s")"""

# Tuned Model Evaluation Metrics

#Identical to the metrics performed before hyperparameter tuning, this section will evaluate the newly tuned models. I will make use of the StratifiedKFold function from scikit-learn to perform cross validation on the data with weighted f1 score as the scoring metric.


# Repeated Stratified K-Fold Cross Validation with k=3 and n_repeats=2
# https://machinelearningmastery.com/cross-validation-for-imbalanced-classification/

# Original models:

# The grid searches in this file are scored with weighted F1, so the same metric is
# requested explicitly for every cross_val_score call in this section. When `scoring`
# is omitted, cross_val_score falls back to the estimator's default scorer, which
# would not be the weighted F1 this section is meant to report.
CV_SCORING = 'f1_weighted'

skfold_LR = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)  # k=10 is a popular choice for evaluating a model, but 3 is usually enough for larger datasets so will be used when tuning hyperparameters
skfold_results_LR = cross_val_score(LR, X, y, cv=skfold_LR, scoring=CV_SCORING, n_jobs=-1)
print(f"Time taken to fit LR: {LR_fit_time:.3f}s")
print("Original Logistic Regression Score: %.2f%% (%.2f%%)" % (skfold_results_LR.mean()*100, skfold_results_LR.std()*100))

#skfold_MLP = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=0)
#skfold_results_MLP = cross_val_score(MLP, X, y, cv=skfold_MLP, scoring=CV_SCORING, n_jobs=-1)
#print("Original MLP Classifier Score: %.2f%% (%.2f%%)" % (skfold_results_MLP.mean()*100, skfold_results_MLP.std()*100))

skfold_XGB = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)
skfold_results_XGB = cross_val_score(XGB, X, y, cv=skfold_XGB, scoring=CV_SCORING, n_jobs=-1)
print(f"Time taken to fit XGB: {XGB_fit_time:.3f}s")
print("Original XGB Classifier Score: %.2f%% (%.2f%%)" % (skfold_results_XGB.mean()*100, skfold_results_XGB.std()*100))

skfold_SVM = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)
skfold_results_SVM = cross_val_score(SVM, X, y, cv=skfold_SVM, scoring=CV_SCORING, n_jobs=-1)
print(f"Time taken to fit SVM: {SVM_fit_time:.3f}s")
print("Original SVM Classifier Score: %.2f%% (%.2f%%)" % (skfold_results_SVM.mean()*100, skfold_results_SVM.std()*100))

#skfold_NB = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=0)
#skfold_results_NB = cross_val_score(NB, X, y, cv=skfold_NB, scoring=CV_SCORING, n_jobs=-1)
#print("Original Naive Bayes Classifier Score: %.2f%% (%.2f%%)" % (skfold_results_NB.mean()*100, skfold_results_NB.std()*100))

print("="*30)

# Re-fit and cross validate with tuned parameters
# Each tuned model is fitted once on the training split (timed), then scored with the
# same splitter and the same metric as its original counterpart above, so the
# "Original" and "Tuned" figures are directly comparable.

# Passing parameters directly - https://stackoverflow.com/a/33119615/19543236

#LR - class_weight and solver are the defaults
LR_tuned_params = {'C': 100,
                   'class_weight': None,
                   'solver': 'lbfgs',
                   'random_state': 0
                   }
LR_v2 = LogisticRegression(**LR_tuned_params)
start_time = time.time()
LR_v2.fit(X_train, y_train)
LR_v2_fit_time = time.time() - start_time
print(f"Time taken to fit LR_v2: {LR_v2_fit_time:.3f}s")
skfold_results_LR_v2 = cross_val_score(LR_v2, X, y, cv=skfold_LR, scoring=CV_SCORING, n_jobs=-1)
print("Tuned Logistic Regression Score: %.2f%% (%.2f%%)" % (skfold_results_LR_v2.mean()*100, skfold_results_LR_v2.std()*100))

#XGB
XGB_tuned_params = {'learning_rate': 0.1,
                    'max_depth': 9,
                    'n_estimators': 250,
                    'random_state': 0}
XGB_v2 = XGBClassifier(**XGB_tuned_params)
start_time = time.time()
XGB_v2.fit(X_train, y_train)
XGB_v2_fit_time = time.time() - start_time
print(f"Time taken to fit XGB_v2: {XGB_v2_fit_time:.3f}s")
skfold_results_XGB_v2 = cross_val_score(XGB_v2, X, y, cv=skfold_XGB, scoring=CV_SCORING, n_jobs=-1)
print("Tuned XGB Classifier Score: %.2f%% (%.2f%%)" % (skfold_results_XGB_v2.mean()*100, skfold_results_XGB_v2.std()*100))

# SVM penalty and loss are the defaults
SVM_tuned_params = {'C': 0.1,
                    'dual': False,
                    'loss': 'squared_hinge',
                    'penalty': 'l2',
                    'random_state': 0}
SVM_v2 = LinearSVC(**SVM_tuned_params)
start_time = time.time()
SVM_v2.fit(X_train, y_train)
SVM_v2_fit_time = time.time() - start_time
print(f"Time taken to fit SVM_v2: {SVM_v2_fit_time:.3f}s")
skfold_results_SVM_v2 = cross_val_score(SVM_v2, X, y, cv=skfold_SVM, scoring=CV_SCORING, n_jobs=-1)
print("Tuned SVM Classifier Score: %.2f%% (%.2f%%)" % (skfold_results_SVM_v2.mean()*100, skfold_results_SVM_v2.std()*100))


## Tuned Logistic Regression Classifier

# Classification report and confusion matrix for the tuned Logistic Regression model,
# evaluated on the held-out test split.
y_pred_test_LR_v2 = LR_v2.predict(X_test)
print(classification_report(y_test, y_pred_test_LR_v2))

cm_LR_v2 = confusion_matrix(y_test, y_pred_test_LR_v2)
LR_v2_accuracy = accuracy_score(y_test, y_pred_test_LR_v2)
LR_v2_CM_title = 'LR v2 Accuracy: {0:.2f}'.format(LR_v2_accuracy)

plt.figure(figsize=(6,6))
# Confusion-matrix cells are whole sample counts, so they are annotated as integers
# (the same format the v1-vs-v2 comparison heatmaps use) rather than with decimals.
sns.heatmap(cm_LR_v2, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues')

plt.xlabel('Predicted label')
plt.ylabel('Actual label')
plt.title(LR_v2_CM_title, size=15)

plt.show()

## Tuned Multi-layer Perceptron Classifier

# MLP evaluation is disabled. The body of this `if False:` is a single string
# literal, so the code quoted inside it is never executed.
if False:
    """"# classification report for MLP Classifier
    y_pred_test_MLP_v2 = MLP_CV_CLF.predict(X_test)
    print(classification_report(y_test, y_pred_test_MLP_v2))

    cm_MLP_v2 = confusion_matrix(y_test, y_pred_test_MLP_v2)

    plt.figure(figsize=(6,6))
    sns.heatmap(cm_MLP_v2, annot=True, fmt=".3f", linewidths=.5, square = True, cmap='Blues')

    plt.xlabel('Predicted label')
    plt.ylabel('Actual label')

    MLP_v2_accuracy = accuracy_score(y_test, y_pred_test_MLP_v2)
    MLP_v2_CM_title = 'MLP v2 Accuracy: {0:.2f}'.format(MLP_v2_accuracy)
    plt.title(MLP_v2_CM_title, size = 15)

    plt.show()"""

## Tuned XGBoost Classifier

# Classification report and confusion matrix for the tuned XGBoost model,
# evaluated on the held-out test split.
y_pred_test_XGB_v2 = XGB_v2.predict(X_test)
print(classification_report(y_test, y_pred_test_XGB_v2))

cm_XGB_v2 = confusion_matrix(y_test, y_pred_test_XGB_v2)
XGB_v2_accuracy = accuracy_score(y_test, y_pred_test_XGB_v2)
XGB_v2_CM_title = 'XGB v2 Accuracy: {0:.2f}'.format(XGB_v2_accuracy)

plt.figure(figsize=(6,6))
# Confusion-matrix cells are whole sample counts, so they are annotated as integers
# (the same format the v1-vs-v2 comparison heatmaps use) rather than with decimals.
sns.heatmap(cm_XGB_v2, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues')

plt.xlabel('Predicted label')
plt.ylabel('Actual label')
plt.title(XGB_v2_CM_title, size=15)

plt.show()

## Tuned Support Vector Classifier

# Classification report and confusion matrix for the tuned LinearSVC model,
# evaluated on the held-out test split.
# Predictions come from SVM_v2, the model built from SVM_tuned_params whose fit time
# and cross-validation score are reported, instead of from the grid-search object.
# This matches how the LR and XGB sections use their *_v2 models and keeps this
# section working when the grid search is skipped or its grid is changed.
y_pred_test_SVM_v2 = SVM_v2.predict(X_test)
print(classification_report(y_test, y_pred_test_SVM_v2))

cm_SVM_v2 = confusion_matrix(y_test, y_pred_test_SVM_v2)
SVM_v2_accuracy = accuracy_score(y_test, y_pred_test_SVM_v2)
SVM_v2_CM_title = 'SVM v2 Accuracy: {0:.2f}'.format(SVM_v2_accuracy)

plt.figure(figsize=(6,6))
# Confusion-matrix cells are whole sample counts, so they are annotated as integers
# (the same format the v1-vs-v2 comparison heatmaps use) rather than with decimals.
sns.heatmap(cm_SVM_v2, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues')

plt.xlabel('Predicted label')
plt.ylabel('Actual label')
plt.title(SVM_v2_CM_title, size=15)

plt.show()

## Tuned Naive Bayes

#This was not a focus, so the code is provided for reference only.

# Naive Bayes evaluation is disabled. The body of this `if 0:` is a single string
# literal (which is why the quoted lines can sit at column 0), so nothing inside it
# is executed.
if 0:
    """# classification report for NB Classifier
y_pred_test_NB_v2 = NB_v2.predict(X_test)
print(classification_report(y_test, y_pred_test_NB_v2))

cm_NB_v2 = confusion_matrix(y_test, y_pred_test_NB_v2)

plt.figure(figsize=(6,6))
sns.heatmap(cm_NB_v2, annot=True, fmt=".3f", linewidths=.5, square = True, cmap='Blues')

plt.xlabel('Predicted label')
plt.ylabel('Actual label')

NB_v2_accuracy = accuracy_score(y_test, y_pred_test_NB_v2)
NB_v2_CM_title = 'NB v2 Accuracy: {0:.2f}'.format(NB_v2_accuracy)
plt.title(NB_v2_CM_title, size = 15)

plt.show()"""

## Precision-Recall and ROC AUC for tuned models

# Continuous scores for the positive class of each tuned model.
# precision_recall_curve and roc_curve sweep a decision threshold over scores
# (probabilities or decision values). Feeding them hard class predictions leaves a
# single operating point per model, so the plotted "curve" is just that point joined
# to the corners.
# NOTE(review): column 1 of predict_proba is taken as the positive class. This assumes
# a binary target whose positive label is classes_[1] (e.g. labels encoded as 0/1) -
# confirm against the label encoding used earlier in the file.
y_score_test_LR_v2 = LR_v2.predict_proba(X_test)[:, 1]
y_score_test_XGB_v2 = XGB_v2.predict_proba(X_test)[:, 1]
# LinearSVC does not provide predict_proba, so its decision values are used instead.
y_score_test_SVM_v2 = SVM_v2.decision_function(X_test)

# Per-point markers are left out: each curve now has one point per distinct score,
# and markers would smear the lines. The models are told apart by colour and legend.

# LR v2 Precision Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_score_test_LR_v2)
plt.plot(recall, precision, label='Logistic Regression', color='purple')
# MLP v2 Precision Recall Curve
#precision, recall, _ = precision_recall_curve(y_test, y_pred_test_MLP_v2)
#plt.plot(recall, precision, marker='o', label='Multi-Layer Perceptron Classifier', color='Red')
# XGBoost v2 Precision Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_score_test_XGB_v2)
plt.plot(recall, precision, label='XGBoost Classifier', color='Green')
# SVM v2 Precision Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_score_test_SVM_v2)
plt.plot(recall, precision, label='Support Vector Machine Classifier', color='Blue')
# NB v2 Precision Recall Curve
#precision, recall, _ = precision_recall_curve(y_test, y_pred_test_NB_v2)
#plt.plot(recall, precision, marker='x', label='Naive Bayes Classifier', color='Orange')

plt.title('Precision-Recall Curve (After Tuning)')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

# ROC Curves

# LR ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_score_test_LR_v2)
plt.plot(fpr, tpr, label='Logistic Regression v2', color='Purple')
# MLP ROC Curve
#fpr, tpr, _ = roc_curve(y_test, y_pred_test_MLP_v2)
#plt.plot(fpr, tpr, marker='o', label='MLP Classifier v2', color='Red')
# XGB ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_score_test_XGB_v2)
plt.plot(fpr, tpr, label='XGB Classifier v2', color='Green')
# SVM ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_score_test_SVM_v2)
plt.plot(fpr, tpr, label='LinearSVC v2', color='Blue')
# NB ROC Curve
#fpr, tpr, _ = roc_curve(y_test, y_pred_test_NB_v2)
#plt.plot(fpr, tpr, marker='x', label='Naive Bayes v2', color='Orange')

# No Skill
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')

plt.title('ROC Curve (After Tuning)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Original vs Tuned Comparison

#Comparing the original/default and tuned models. This section is essentially a summary of the previous sections.

## LR v1 vs LR v2 Comparison

# Positive-class probabilities for the original and the tuned Logistic Regression.
# The curve functions sweep a threshold over scores; hard class predictions would
# reduce each curve to a single operating point.
# NOTE(review): assumes LR was already fitted on the training split earlier in the
# file, and that classes_[1] is the positive label (e.g. 0/1 encoding) - confirm.
y_score_test_LR = LR.predict_proba(X_test)[:, 1]
y_score_test_LR_v2 = LR_v2.predict_proba(X_test)[:, 1]

# Precision Recall Curve
# (no per-point markers: each curve now has one point per distinct score)
precision, recall, _ = precision_recall_curve(y_test, y_score_test_LR)
plt.plot(recall, precision, label='Logistic Regression v1', color='Purple')
precision, recall, _ = precision_recall_curve(y_test, y_score_test_LR_v2)
plt.plot(recall, precision, label='Logistic Regression v2', color='Red')

plt.title('Precision-Recall Curve (LR v1 vs LR v2 Comparison)')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_score_test_LR)
plt.plot(fpr, tpr, label='Logistic Regression v1', color='Purple')
fpr, tpr, _ = roc_curve(y_test, y_score_test_LR_v2)
plt.plot(fpr, tpr, label='Logistic Regression v2', color='Red')
plt.title('ROC Curve (LR v1 vs LR v2 Comparison)')
# Axis labels added so this figure matches the tuned-model ROC plot.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Fit times bar graph
plt.bar(['Logistic Regression v1', 'Logistic Regression v2'], [LR_fit_time, LR_v2_fit_time], color=['Purple', 'Red'])
plt.title('Fit Times (LR v1 vs LR v2 Comparison)')
plt.ylabel('Fit Time (s)')
plt.show()


## XGB v1 vs XGB v2 Comparison

# Positive-class probabilities for the original and the tuned XGBoost classifier.
# The curve functions sweep a threshold over scores; hard class predictions would
# reduce each curve to a single operating point.
# NOTE(review): assumes XGB was already fitted on the training split earlier in the
# file, and that classes_[1] is the positive label (e.g. 0/1 encoding) - confirm.
y_score_test_XGB = XGB.predict_proba(X_test)[:, 1]
y_score_test_XGB_v2 = XGB_v2.predict_proba(X_test)[:, 1]

# Precision Recall Curve
# (no per-point markers: each curve now has one point per distinct score)
precision, recall, _ = precision_recall_curve(y_test, y_score_test_XGB)
plt.plot(recall, precision, label='XGB v1', color='Purple')
precision, recall, _ = precision_recall_curve(y_test, y_score_test_XGB_v2)
plt.plot(recall, precision, label='XGB v2', color='Red')

plt.title('Precision-Recall Curve (XGB v1 vs XGB v2 Comparison)')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_score_test_XGB)
plt.plot(fpr, tpr, label='XGB v1', color='Purple')
fpr, tpr, _ = roc_curve(y_test, y_score_test_XGB_v2)
plt.plot(fpr, tpr, label='XGB v2', color='Red')
plt.title('ROC Curve (XGB v1 vs XGB v2 Comparison)')
# Axis labels added so this figure matches the tuned-model ROC plot.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Fit times bar graph
plt.bar(['XGB v1', 'XGB v2'], [XGB_fit_time, XGB_v2_fit_time], color=['Purple', 'Red'])
plt.title('Fit Times (XGB v1 vs XGB v2 Comparison)')
plt.ylabel('Fit Time (s)')
plt.show()

## SVM v1 vs SVM v2 Comparison

#These models perform very similarly, but the tuned model is much faster to fit and predict.

# Decision values for the original and the tuned SVM.
# The curve functions sweep a threshold over scores; hard class predictions would
# reduce each curve to a single operating point.
# NOTE(review): assumes SVM is a LinearSVC already fitted on the training split
# earlier in the file (it offers decision_function but not predict_proba) - confirm.
y_score_test_SVM = SVM.decision_function(X_test)
y_score_test_SVM_v2 = SVM_v2.decision_function(X_test)

# Precision Recall Curve
# (no per-point markers: each curve now has one point per distinct score)
precision, recall, _ = precision_recall_curve(y_test, y_score_test_SVM)
plt.plot(recall, precision, label='SVM v1', color='Purple')
precision, recall, _ = precision_recall_curve(y_test, y_score_test_SVM_v2)
plt.plot(recall, precision, label='SVM v2', color='Red')

plt.title('Precision-Recall Curve (SVM v1 vs SVM v2 Comparison)')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_score_test_SVM)
plt.plot(fpr, tpr, label='SVM v1', color='Purple')
fpr, tpr, _ = roc_curve(y_test, y_score_test_SVM_v2)
plt.plot(fpr, tpr, label='SVM v2', color='Red')
plt.title('ROC Curve (SVM v1 vs SVM v2 Comparison)')
# Axis labels added so this figure matches the tuned-model ROC plot.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Fit times bar graph
plt.bar(['SVM v1', 'SVM v2'], [SVM_fit_time, SVM_v2_fit_time], color=['Purple', 'Red'])
plt.title('Fit Times (SVM v1 vs SVM v2 Comparison)')
plt.ylabel('Fit Time (s)')
plt.show()

# Side-by-side confusion matrices, one figure per model family:
# v1 predictions on the left axis, v2 predictions on the right axis.
# RdPu shows up better than Blues and the default colours on my monitor (https://scipy-cookbook.readthedocs.io/items/Matplotlib_Show_colormaps.html)
cm_comparisons = (
    # LR v1 confusion matrix vs LR v2 confusion matrix
    ('Logistic Regression v1 Confusion Matrix', y_pred_test_LR,
     'Logistic Regression v2 Confusion Matrix', y_pred_test_LR_v2),
    # XGB v1 confusion matrix vs XGB v2 confusion matrix
    ('XGB v1 Confusion Matrix', y_pred_test_XGB,
     'XGB v2 Confusion Matrix', y_pred_test_XGB_v2),
    # SVM v1 confusion matrix vs SVM v2 confusion matrix
    ('SVM v1 Confusion Matrix', y_pred_test_SVM,
     'SVM v2 Confusion Matrix', y_pred_test_SVM_v2),
)

for left_title, left_pred, right_title, right_pred in cm_comparisons:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
    for axis, panel_title, panel_pred in ((ax1, left_title, left_pred), (ax2, right_title, right_pred)):
        sns.heatmap(confusion_matrix(y_test, panel_pred), annot=True, fmt='d', ax=axis, cmap='RdPu')
        axis.set_title(panel_title)
    plt.show()

# Print the v1 and v2 classification reports for each model family back to back,
# with a separator line between families (none after the last one).
report_comparisons = (
    # LR v1 classification report vs LR v2 classification report
    ('Logistic Regression v1 Classification Report', y_pred_test_LR,
     'Logistic Regression v2 Classification Report', y_pred_test_LR_v2),
    # XGB v1 classification report vs XGB v2 classification report
    ('XGB v1 Classification Report', y_pred_test_XGB,
     'XGB v2 Classification Report', y_pred_test_XGB_v2),
    # SVM v1 classification report vs SVM v2 classification report
    ('SVM v1 Classification Report', y_pred_test_SVM,
     'SVM v2 Classification Report', y_pred_test_SVM_v2),
)

for position, (heading_v1, pred_v1, heading_v2, pred_v2) in enumerate(report_comparisons):
    if position > 0:
        print("="*50)
    print(heading_v1)
    print(classification_report(y_test, pred_v1))
    print(heading_v2)
    print(classification_report(y_test, pred_v2))