Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
6006CEM_2021s1_8405405_MAW/6006CEM_code.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
165 lines (138 sloc)
9.89 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn import preprocessing, model_selection, neighbors, pipeline | |
from sklearn.preprocessing import PolynomialFeatures | |
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix | |
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, RandomizedSearchCV | |
import pandas as pd | |
from sklearn.linear_model import LogisticRegression | |
import time | |
import matplotlib.pyplot as plt | |
'''Cleaning the dataset'''
# Load the Wisconsin breast-cancer data set (feature columns plus a 'class' label).
dataset = pd.read_csv('breast-cancer-wisconsin.data')  # loading our dataset using pandas
# The raw file marks missing values with '?'; replace them with -99999 so they are
# treated as extreme outliers instead of NaN (original author's stated choice).
dataset.replace('?', -99999, inplace=True)
# Drop the patient id column: it carries no predictive signal and can mess up accuracy.
# FIX: use the columns= keyword -- the positional `axis` argument (`drop(['id'], 1)`)
# was deprecated in pandas 1.x and removed in pandas 2.0.
dataset.drop(columns=['id'], inplace=True)
'''Assigning data to X and y'''
X = np.array(dataset.drop(columns=['class']))  # every feature column except the label
y = np.array(dataset['class'])                 # the label column
print("X shape: ", X.shape)
print("y shape: ", y.shape)
'''splitting data into training and testing'''
# Shuffle and split the raw features 80/20 into train and test sets.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)
'''scaling data to compare the results'''
scaler = preprocessing.StandardScaler()  # standard scaler to compare scaled vs unscaled models
# X_scaled (scaler fit on the FULL data) is kept because the cross-validated
# grid/random searches further down in the script consume it.
X_scaled = scaler.fit_transform(X)
print(X_scaled)
# FIX (data leakage): previously the scaler was fit on the whole data set *before*
# the train/test split, so test-set statistics leaked into training. Split the raw
# data first, fit the scaler on the training portion only, then transform both.
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = model_selection.train_test_split(X, y, test_size=0.2)
split_scaler = preprocessing.StandardScaler().fit(X_train_scaled)
X_train_scaled = split_scaler.transform(X_train_scaled)
X_test_scaled = split_scaler.transform(X_test_scaled)
'''LOGISTIC REGRESSION'''
# FIX: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is the supported replacement.
from sklearn.metrics import ConfusionMatrixDisplay

model_logistic = LogisticRegression(max_iter=100000)  # generous max_iter so the solver converges on unscaled data
model_logistic.fit(X_train, y_train)  # fitting training data
acc_logistic = model_logistic.score(X_test, y_test)  # mean accuracy on the held-out set
prediction_logistic = model_logistic.predict(X_test)
print("\n---------------------------------------------------------------------------------------------------\n")
print("LOGISTIC REGRESSION\n")
print("\nClassification report: ", classification_report(y_test, prediction_logistic),'\n')  # classification report for logistic regression
print("\nConfusion Matrix: ", confusion_matrix(y_test, prediction_logistic),'\n')  # confusion matrix for logistic regression
ConfusionMatrixDisplay.from_estimator(model_logistic, X_test, y_test)
print("\nAccuracy Logistic: ", acc_logistic)
print("\n---------------------------------------------------------------------------------------------------\n")
'''LOGISTIC REGRESSION SCALED'''
# Re-fit the same logistic estimator on the standardised split and report
# the same metrics as the unscaled run so the two can be compared.
model_logistic.fit(X_train_scaled, y_train_scaled)
prediction_scaled = model_logistic.predict(X_test_scaled)
acc_scaled = model_logistic.score(X_test_scaled, y_test_scaled)
report_scaled = classification_report(y_test_scaled, prediction_scaled)
matrix_scaled = confusion_matrix(y_test_scaled, prediction_scaled)
print("SCALED LOGISTIC REGRESSION\n")
print("\nClassification report for scaled Data: ", report_scaled, '\n')
print("\nConfusion Matrix: ", matrix_scaled, '\n')
print("\nAccuracy Scaled Logistic: ", acc_scaled)
print("\n---------------------------------------------------------------------------------------------------\n")
'''LOGISTIC REGRESSION POLYNOMIAL '''
print("POLYNOMIAL REGRESSION\n")
# Try polynomial feature expansions of increasing degree to see which performs best.
# NOTE(review): loop indentation was lost in extraction; each degree is assumed to
# print its own report/matrix/accuracy/separator -- confirm against the original.
for degree in (1, 2, 3):
    model_poly = pipeline.make_pipeline(PolynomialFeatures(degree=degree), LogisticRegression())
    model_poly.fit(X_train_scaled, y_train_scaled)
    prediction_poly = model_poly.predict(X_test_scaled)
    acc_poly = model_poly.score(X_test_scaled, y_test_scaled)
    print("\nClassification report for Polynomial Data: ", classification_report(y_test_scaled, prediction_poly), '\n')
    print("\nConfusion Matrix: ", confusion_matrix(y_test_scaled, prediction_poly), '\n')
    print("\nAccuracy Polynomial Logistic Degree %.f: " % degree, acc_poly)
    print("\n---------------------------------------------------------------------------------------------------\n")
'''GRID SEARCH FOR LOGISTIC REGRESSION'''
# Exhaustive search over solver/penalty/C, scored by accuracy over a repeated
# stratified 10-fold CV, and timed so it can be compared with random search.
model_logistic_grid = LogisticRegression()
grid_logistic = dict(solver=['newton-cg', 'lbfgs', 'liblinear'], penalty=['l2'], C=[100, 10, 1.0, 0.1, 0.01])  # search space
start_logistic_grid = time.time()  # timer start, compared with random search below
cv_logistic = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# FIX: n_jobs=-1 (all cores) for consistency with the KNN grid search below;
# the original ran this one single-threaded, skewing the timing comparison.
grid_logistic_search = GridSearchCV(estimator=model_logistic_grid, param_grid=grid_logistic, n_jobs=-1, cv=cv_logistic, scoring='accuracy', error_score=0)
grid_logistic_result = grid_logistic_search.fit(X_scaled, y)  # fitting the data
print("GRID SEARCH LOGISTIC")
print("Best: %f using %s" % (grid_logistic_result.best_score_, grid_logistic_result.best_params_))  # best score + params
stop_logistic_grid = time.time()
print("\n{} seconds to run".format(round(stop_logistic_grid - start_logistic_grid, 3)))
print("\n---------------------------------------------------------------------------------------------------\n")
'''RANDOM SEARCH FOR LOGISTIC REGRESSION'''
# Random sample (default n_iter=10) of the same parameter space, to compare
# wall-clock time and best score against the exhaustive grid search above.
model_logistic_random = LogisticRegression()
random_logistic = dict(solver=['newton-cg', 'lbfgs', 'liblinear'], penalty=['l2'], C=[100, 10, 1.0, 0.1, 0.01])
start_logistic_random = time.time()
cv_random = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# FIX: random_state pins which candidates are sampled so the comparison with
# grid search is reproducible; n_jobs=-1 matches the other searches in this file.
random_logistic_search = RandomizedSearchCV(estimator=model_logistic_random, param_distributions=random_logistic, n_jobs=-1, cv=cv_random, scoring='accuracy', error_score=0, random_state=1)
random_logistic_result = random_logistic_search.fit(X_scaled, y)
print("RANDOM SEARCH LOGISTIC")
print("Best: %f using %s" % (random_logistic_result.best_score_, random_logistic_result.best_params_))
stop_logistic_random = time.time()
print("\n{} seconds to run".format(round(stop_logistic_random - start_logistic_random, 3)))
print("\n---------------------------------------------------------------------------------------------------\n")
'''KNN ALGORITHM'''
# FIX: plot_confusion_matrix was removed in scikit-learn 1.2; use the
# ConfusionMatrixDisplay replacement.
from sklearn.metrics import ConfusionMatrixDisplay

model_knn = neighbors.KNeighborsClassifier(n_neighbors=5)  # KNN classifier with 5 neighbors
model_knn.fit(X_train, y_train)  # fitting the data
acc_knn = model_knn.score(X_test, y_test)  # mean accuracy on held-out data
prediction_knn = model_knn.predict(X_test)
print("K-NEAREST NEIGHBORS\n")
print("\nClassification report: ", classification_report(y_test, prediction_knn))  # Classification report for KNN
print("\nConfusion Matrix: ", confusion_matrix(y_test, prediction_knn),'\n')  # Confusion Matrix for KNN
ConfusionMatrixDisplay.from_estimator(model_knn, X_test, y_test)
plt.show()  # display the queued confusion-matrix figures
print("Accuracy KNN: ", acc_knn)
print("\n---------------------------------------------------------------------------------------------------\n")
'''SCALED KNN ALGORITHM'''
# Same 5-neighbour classifier, trained on the standardised split for comparison
# with the unscaled KNN run above.
model_knn_scaled = neighbors.KNeighborsClassifier(n_neighbors=5)
model_knn_scaled.fit(X_train_scaled, y_train_scaled)
prediction_knn_scaled = model_knn_scaled.predict(X_test_scaled)
acc_knn_scaled = model_knn_scaled.score(X_test_scaled, y_test_scaled)
report_knn_scaled = classification_report(y_test_scaled, prediction_knn_scaled)
matrix_knn_scaled = confusion_matrix(y_test_scaled, prediction_knn_scaled)
print("SCALED K-NEAREST NEIGHBORS\n")
print("\nClassification report: ", report_knn_scaled)
print("\nConfusion Matrix: ", matrix_knn_scaled, '\n')
print("Accuracy Scaled KNN: ", acc_knn_scaled)
print("\n---------------------------------------------------------------------------------------------------\n")
'''5 FOLD CROSS VALIDATION KNN'''
# Average a 3-neighbour KNN's accuracy across 5 folds of the scaled data to get
# a less split-dependent estimate than a single train/test run.
knn_cv = neighbors.KNeighborsClassifier(n_neighbors=3)
cv_scores = cross_val_score(knn_cv, X_scaled, y, cv=5)
mean_cv_score = np.mean(cv_scores)
print("5 FOLD CROSS VALIDATION KNN\n")
print("5 Scores: {}".format(cv_scores), '\n')
print("cv_scores mean: {}".format(mean_cv_score))
print("\n---------------------------------------------------------------------------------------------------\n")
'''GRID SEARCH FOR KNN'''
# Exhaustive search over neighbour count / weighting / distance metric, scored
# by accuracy over repeated stratified 10-fold CV and timed for comparison with
# the random search below.
model_knn_grid = neighbors.KNeighborsClassifier()
grid_knn = dict(
    n_neighbors=range(1, 19, 2),
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan', 'minkowski'],
)
start_knn_grid = time.time()
cv_knn = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_knn_search = GridSearchCV(
    estimator=model_knn_grid,
    param_grid=grid_knn,
    n_jobs=-1,
    cv=cv_knn,
    scoring='accuracy',
    error_score=0,
)
grid_knn_result = grid_knn_search.fit(X_scaled, y)
print("GRID SEARCH KNN\n")
print("Best: %f using %s" % (grid_knn_result.best_score_, grid_knn_result.best_params_))
stop_knn_grid = time.time()
print("\n{} seconds to run".format(round(stop_knn_grid - start_knn_grid, 3)))
print("\n---------------------------------------------------------------------------------------------------\n")
'''RANDOM SEARCH FOR KNN'''
# Random sample of the same KNN parameter space, timed against the grid search above.
model_knn_random = neighbors.KNeighborsClassifier()
random_knn = dict(n_neighbors=range(1, 19, 2), weights=['uniform', 'distance'], metric=['euclidean', 'manhattan', 'minkowski'])
start_knn_random = time.time()
cv_knn_random = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# FIX: n_jobs=-1 for parity with the KNN grid search timing (the original ran this
# search single-threaded, making the grid-vs-random timing comparison unfair);
# random_state makes the sampled candidates reproducible.
random_knn_search = RandomizedSearchCV(estimator=model_knn_random, param_distributions=random_knn, n_jobs=-1, cv=cv_knn_random, scoring='accuracy', error_score=0, random_state=1)
random_knn_result = random_knn_search.fit(X_scaled, y)
print("RANDOM SEARCH KNN\n")
print("Best: %f using %s" % (random_knn_result.best_score_, random_knn_result.best_params_))
stop_knn_random = time.time()
print("\n{} seconds to run".format(round(stop_knn_random - start_knn_random, 3)))
print("\n---------------------------------------------------------------------------------------------------\n")