Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing, model_selection, neighbors, pipeline
from sklearn.linear_model import LogisticRegression
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is the supported replacement.
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
'''Cleaning the dataset'''
dataset = pd.read_csv('breast-cancer-wisconsin.data')  # loading our dataset using pandas
dataset.replace('?', -99999, inplace=True)  # replacing missing-value markers with -99999 so they're treated as outliers
# pandas 2.0 removed the positional `axis` argument of drop(); use the keyword form.
dataset.drop(columns=['id'], inplace=True)  # the id column carries no signal and can mess up the accuracy
# The '?' markers forced the affected column to dtype object (numeric strings);
# coerce everything back to real numbers before handing it to scikit-learn.
dataset = dataset.apply(pd.to_numeric)
'''Assigning data to X and y'''
X = np.array(dataset.drop(columns=['class']))  # all feature columns except the class column
y = np.array(dataset['class'])  # the class column is the target
print("X shape: ", X.shape)
print("y shape: ", y.shape)
'''splitting data into training and testing'''
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)  # shuffling and splitting the data 80-20%
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)
'''scaling data to compare the results'''
# Fit the scaler on the TRAINING portion only, then apply it to the test
# portion: the original fit on the full X before splitting (test-set leakage)
# and then re-split with an independent shuffle, so the scaled and unscaled
# models were evaluated on different splits and weren't comparable.
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# The labels are never scaled; the *_scaled names are kept so the rest of
# the script keeps working unchanged.
y_train_scaled, y_test_scaled = y_train, y_test
# Full-data scaling kept for the cross-validation / hyper-parameter search
# sections below, which operate on the whole dataset.
X_scaled = preprocessing.StandardScaler().fit_transform(X)
print(X_scaled)
'''LOGISTIC REGRESSION'''
model_logistic = LogisticRegression(max_iter=100000)  # high max_iter so the solver always converges on unscaled data
model_logistic.fit(X_train, y_train)  # fitting training data
acc_logistic = model_logistic.score(X_test, y_test)  # mean accuracy on the held-out data
prediction_logistic = model_logistic.predict(X_test)
print("\n---------------------------------------------------------------------------------------------------\n")
print("LOGISTIC REGRESSION\n")
print("\nClassification report: ", classification_report(y_test, prediction_logistic), '\n')  # classification report for logistic regression
print("\nConfusion Matrix: ", confusion_matrix(y_test, prediction_logistic), '\n')  # confusion matrix for logistic regression
# plot_confusion_matrix was removed in scikit-learn 1.2; the Display API is the replacement.
ConfusionMatrixDisplay.from_estimator(model_logistic, X_test, y_test)
print("\nAccuracy Logistic: ", acc_logistic)
print("\n---------------------------------------------------------------------------------------------------\n")
'''LOGISTIC REGRESSION SCALED'''
# Re-fit the same logistic model, this time on the standard-scaled split,
# so its accuracy can be compared against the unscaled run above.
model_logistic.fit(X_train_scaled, y_train_scaled)
prediction_scaled = model_logistic.predict(X_test_scaled)
acc_scaled = model_logistic.score(X_test_scaled, y_test_scaled)
report_scaled = classification_report(y_test_scaled, prediction_scaled)
print("SCALED LOGISTIC REGRESSION\n")
print("\nClassification report for scaled Data: ", report_scaled, '\n')
print("\nConfusion Matrix: ", confusion_matrix(y_test_scaled, prediction_scaled),'\n')
print("\nAccuracy Scaled Logistic: ", acc_scaled)
print("\n---------------------------------------------------------------------------------------------------\n")
'''LOGISTIC REGRESSION POLYNOMIAL '''
print("POLYNOMIAL REGRESSION\n")
# Try polynomial feature expansions of degree 1-3 to see which one gives the
# best results. NOTE: the loop body below had lost its indentation in the
# source (a SyntaxError as written); the structure is restored here.
for degree in range(1, 4):
    # Pipeline: expand the features to the given degree, then fit logistic regression.
    model_poly = pipeline.make_pipeline(PolynomialFeatures(degree=degree), LogisticRegression())
    model_poly.fit(X_train_scaled, y_train_scaled)
    acc_poly = model_poly.score(X_test_scaled, y_test_scaled)
    prediction_poly = model_poly.predict(X_test_scaled)
    print("\nClassification report for Polynomial Data: ", classification_report(y_test_scaled, prediction_poly), '\n')
    print("\nConfusion Matrix: ", confusion_matrix(y_test_scaled, prediction_poly), '\n')
    # %d instead of the original %.f: identical output for these integer degrees, clearer intent.
    print("\nAccuracy Polynomial Logistic Degree %d: " % degree, acc_poly)
print("\n---------------------------------------------------------------------------------------------------\n")
'''GRID SEARCH FOR LOGISTIC REGRESSION'''
# Exhaustive search over solver/penalty/C, scored by accuracy over 10-fold
# stratified CV repeated 3 times, and timed for comparison with random search.
model_logistic_grid = LogisticRegression()
grid_logistic = dict(solver=['newton-cg', 'lbfgs', 'liblinear'], penalty=['l2'], C=[100, 10, 1.0, 0.1, 0.01])  # search space
start_logistic_grid = time.time()  # starting the timer for grid search to compare with random search
cv_logistic = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# n_jobs=-1 (all cores) for consistency with the KNN grid search below; the
# original used n_jobs=1 here, which skews the grid-vs-random timing comparison.
grid_logistic_search = GridSearchCV(estimator=model_logistic_grid, param_grid=grid_logistic, n_jobs=-1, cv=cv_logistic, scoring='accuracy', error_score=0)
grid_logistic_result = grid_logistic_search.fit(X_scaled, y)  # fitting the data
print("GRID SEARCH LOGISTIC")
print("Best: %f using %s" % (grid_logistic_result.best_score_, grid_logistic_result.best_params_))  # printing out the best results
stop_logistic_grid = time.time()
print("\n{} seconds to run".format(round(stop_logistic_grid - start_logistic_grid, 3)))
print("\n---------------------------------------------------------------------------------------------------\n")
'''RANDOM SEARCH FOR LOGISTIC REGRESSION'''
# Same search space as the grid search above, but randomly sampled
# (default n_iter=10 candidates), timed for comparison.
model_logistic_random = LogisticRegression()
random_logistic = dict(solver=['newton-cg', 'lbfgs', 'liblinear'], penalty=['l2'], C=[100, 10, 1.0, 0.1, 0.01])
start_logistic_random = time.time()
cv_random = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# n_jobs=-1 for consistency with the grid searches; timing stays comparable.
# NOTE(review): no random_state is set on RandomizedSearchCV, so the sampled
# candidates differ run to run — set one if reproducibility matters.
random_logistic_search = RandomizedSearchCV(estimator=model_logistic_random, param_distributions=random_logistic, n_jobs=-1, cv=cv_random, scoring='accuracy', error_score=0)
random_logistic_result = random_logistic_search.fit(X_scaled, y)
print("RANDOM SEARCH LOGISTIC")
print("Best: %f using %s" % (random_logistic_result.best_score_, random_logistic_result.best_params_))
stop_logistic_random = time.time()
print("\n{} seconds to run".format(round(stop_logistic_random - start_logistic_random, 3)))
print("\n---------------------------------------------------------------------------------------------------\n")
'''KNN ALGORITHM'''
model_knn = neighbors.KNeighborsClassifier(n_neighbors=5)  # KNN classifier with 5 neighbors
model_knn.fit(X_train, y_train)  # fitting the data
acc_knn = model_knn.score(X_test, y_test)  # mean accuracy on the held-out data
prediction_knn = model_knn.predict(X_test)
print("K-NEAREST NEIGHBORS\n")
print("\nClassification report: ", classification_report(y_test, prediction_knn))  # Classification report for KNN
print("\nConfusion Matrix: ", confusion_matrix(y_test, prediction_knn), '\n')  # Confusion Matrix for KNN
# plot_confusion_matrix was removed in scikit-learn 1.2; the Display API is the replacement.
ConfusionMatrixDisplay.from_estimator(model_knn, X_test, y_test)
plt.show()  # blocks until the open confusion-matrix figures are closed
print("Accuracy KNN: ", acc_knn)
print("\n---------------------------------------------------------------------------------------------------\n")
'''SCALED KNN ALGORITHM'''
# KNN is distance-based, so feature scaling can matter; repeat the 5-neighbor
# run on the standard-scaled split for comparison with the unscaled run above.
model_knn_scaled = neighbors.KNeighborsClassifier(n_neighbors=5)
model_knn_scaled.fit(X_train_scaled, y_train_scaled)
prediction_knn_scaled = model_knn_scaled.predict(X_test_scaled)
acc_knn_scaled = model_knn_scaled.score(X_test_scaled, y_test_scaled)
print("SCALED K-NEAREST NEIGHBORS\n")
print("\nClassification report: ", classification_report(y_test_scaled, prediction_knn_scaled))
print("\nConfusion Matrix: ", confusion_matrix(y_test_scaled, prediction_knn_scaled),'\n')
print("Accuracy Scaled KNN: ", acc_knn_scaled)
print("\n---------------------------------------------------------------------------------------------------\n")
'''5 FOLD CROSS VALIDATION KNN'''
# Average a 3-neighbor KNN's accuracy over 5 folds of the scaled data to get
# a less split-dependent estimate than a single train/test run.
knn_cv = neighbors.KNeighborsClassifier(n_neighbors=3)
cv_scores = cross_val_score(knn_cv, X_scaled, y, cv=5)
mean_cv_score = np.mean(cv_scores)
print("5 FOLD CROSS VALIDATION KNN\n")
print("5 Scores: {}".format(cv_scores), '\n')
print("cv_scores mean: {}".format(mean_cv_score))
print("\n---------------------------------------------------------------------------------------------------\n")
'''GRID SEARCH FOR KNN'''
# Exhaustive search over neighbor count, weighting scheme, and distance
# metric, scored by accuracy over 10-fold stratified CV repeated 3 times.
model_knn_grid = neighbors.KNeighborsClassifier()
grid_knn = {
    'n_neighbors': range(1, 19, 2),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
}
start_knn_grid = time.time()  # timed so it can be compared against random search
cv_knn = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_knn_search = GridSearchCV(estimator=model_knn_grid, param_grid=grid_knn,
                               n_jobs=-1, cv=cv_knn, scoring='accuracy', error_score=0)
grid_knn_result = grid_knn_search.fit(X_scaled, y)
print("GRID SEARCH KNN\n")
print("Best: %f using %s" % (grid_knn_result.best_score_, grid_knn_result.best_params_))
stop_knn_grid = time.time()
print("\n{} seconds to run".format(round(stop_knn_grid - start_knn_grid, 3)))
print("\n---------------------------------------------------------------------------------------------------\n")
'''RANDOM SEARCH FOR KNN'''
# Randomly sampled search (default n_iter=10 candidates) over the same space
# as the KNN grid search, timed for comparison.
model_knn_random = neighbors.KNeighborsClassifier()
random_knn = dict(n_neighbors=range(1, 19, 2), weights=['uniform', 'distance'], metric=['euclidean', 'manhattan', 'minkowski'])
start_knn_random = time.time()
cv_knn_random = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# n_jobs=-1 added for consistency with the KNN grid search above; the original
# left it at the default (1), which makes the timing comparison unfair.
# NOTE(review): no random_state is set on RandomizedSearchCV, so the sampled
# candidates differ run to run — set one if reproducibility matters.
random_knn_search = RandomizedSearchCV(estimator=model_knn_random, param_distributions=random_knn, n_jobs=-1, cv=cv_knn_random, scoring='accuracy', error_score=0)
random_knn_result = random_knn_search.fit(X_scaled, y)
print("RANDOM SEARCH KNN\n")
print("Best: %f using %s" % (random_knn_result.best_score_, random_knn_result.best_params_))
stop_knn_random = time.time()
print("\n{} seconds to run".format(round(stop_knn_random - start_knn_random, 3)))
print("\n---------------------------------------------------------------------------------------------------\n")