Commit
Tereso del Rio committed on Sep 14, 2023
1 parent 94aaa66 · commit f48eafb
Showing 7 changed files with 373 additions and 0 deletions.
@@ -0,0 +1,3 @@

purposes = ['Train', 'Test']
dataset_qualities = ['Normal', 'Balanced', 'Augmented']
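These two lists drive the dataset layout used throughout the commit. As a minimal sketch, assuming the directory pattern used by find_dataset_filename later in this commit, the six purpose/quality dataset variants can be enumerated like this:

from itertools import product

# Enumerate every (purpose, quality) combination; the path pattern mirrors
# find_dataset_filename() added later in this commit.
for purpose, quality in product(purposes, dataset_qualities):
    print(f'datasets/{purpose}/{quality}_{purpose}_dataset.txt')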
@@ -0,0 +1,70 @@
"""Contains the grid of hyperparameters that each model will try"""

grid = dict()
grid['RF'] = {
    'n_estimators': [200, 300, 400, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy']
}
grid['KNN'] = {
    'n_neighbors': [1, 3, 5, 7, 12],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    # 'leaf_size': range(1, 10, 3),
    # 'p': range(1, 4, 1)
}
grid['MLP'] = {
    'hidden_layer_sizes': [(5, 5), (15, 15), (20, 20), (10, 10, 10), (20, 20, 20)],  # [(i, i) for i in range(50, 20, 5)] + [(i, i, i) for i in range(50, 20, 5)]
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['constant', 'adaptive'],
    'alpha': [0.05, 0.005],
    'max_iter': [1000]
}
grid['DT'] = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 4, 7, 10, 13, 16, 19]
}
grid['SVC'] = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'tol': [0.0316],
    'C': [5, 100, 200, 300],
    'gamma': ['scale', 'auto']
}

grid['RFR'] = {
    'criterion': ['squared_error', 'friedman_mse'],
    'max_depth': [1, 3, 7],
    'min_samples_leaf': [1, 5, 10],
}
grid['KNNR'] = {
    'n_neighbors': [3, 5, 10],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
grid['MLPR'] = {
    'hidden_layer_sizes': [(100,), (20, 20), (10, 10, 10)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01]
}
grid['DTR'] = {
    'splitter': ['best', 'random'],
    'max_depth': [1, 3, 7, 12],
    'min_samples_leaf': [1, 5, 10],
    # 'min_weight_fraction_leaf': [0.1, 0.5, 0.9],
    # 'max_features': ['auto', 'log2', 'sqrt', None],
    # 'max_leaf_nodes': [None, 10, 50, 90]
}
grid['SVR'] = {
    'kernel': ('linear', 'rbf', 'poly'),
    'C': [1.5, 10],
    'gamma': [1e-7, 1e-4],
    'epsilon': [0.1, 0.2, 0.3, 0.5]
}
grid['SGD'] = {
    'loss': ['squared_error', 'huber', 'epsilon_insensitive'],
    'penalty': ['l2', 'l1', 'elasticnet']
}
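The keys of grid match the short model names declared in config.ml_models (the next file). A minimal sketch of how one of these grids would typically be consumed; GridSearchCV and the toy data are illustrative, not part of this commit:

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Toy data purely for illustration
X, y = make_classification(n_samples=200, n_features=10, random_state=0)

# Exhaustive search over the 'RF' grid above: 80 combinations x 5 folds
search = GridSearchCV(RandomForestClassifier(), grid['RF'], cv=5, n_jobs=-1)
search.fit(X, y)
print(search.best_params_)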
@@ -0,0 +1,42 @@
"""Contains the ml models that will be used in the project"""

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

ml_models = [
    'KNN',
    'DT',
    'SVC',
    'RF',
    'MLP'
]

ml_regressors = [
    'DTR',
    'SVR',
    'RFR',
    'KNNR',
    'MLPR'
]

sklearn_models = {
    'DT': DecisionTreeClassifier,
    'KNN': KNeighborsClassifier,
    'RF': RandomForestClassifier,
    'SVC': SVC,
    'MLP': MLPClassifier,
    'DTR': DecisionTreeRegressor,
    'KNNR': KNeighborsRegressor,
    'RFR': RandomForestRegressor,
    'SVR': SVR,
    'MLPR': MLPRegressor
}
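Since sklearn_models maps short names to the classes themselves rather than to instances, a model can be built by name. A small sketch, with hyperparameter values taken from grid['DT'] in the previous file:

name = 'DT'
ModelClass = sklearn_models[name]  # look up the class by its short name
model = ModelClass(criterion='entropy', max_depth=7)
print(model)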
@@ -0,0 +1,49 @@
import os
from config.general_values import dataset_qualities
from config.general_values import purposes


def find_hyperparams_filename(method, ml_model):
    return os.path.join(os.path.dirname(__file__),
                        'config', 'hyperparams',
                        f'{method}_{ml_model}')


def find_model_filename(method, ml_model):
    return os.path.join(os.path.dirname(__file__),
                        'config', 'models',
                        f'{method}_{ml_model}.txt')


def find_dataset_filename(purpose, method=None):
    if purpose == "unclean":
        return os.path.join(os.path.dirname(__file__),
                            'DatasetsBeforeProcessing',
                            'dataset_without_repetition_return_ncells.txt')
        # 'dataset_with_repetition_return_ncells.txt')
        # for returning "repeated" instances:
        # those with the same number of cells for all projections
    elif purpose == "clean":
        return os.path.join(os.path.dirname(__file__),
                            'datasets',
                            'clean_dataset.txt')
    elif purpose == 'instances':
        return os.path.join(os.path.dirname(__file__),
                            'datasets',
                            'dataset_instances.csv')
    elif purpose in purposes:
        return os.path.join(os.path.dirname(__file__),
                            'datasets', f'{purpose}',
                            f'{method}_{purpose}_dataset.txt')
    else:
        raise Exception(f"Purpose {purpose} not found")


def find_output_filename(training_method):
    return os.path.join(os.path.dirname(__file__), 'results',
                        f'ml_trained_in_{training_method}.csv')


def find_other_filename(search):
    return os.path.join(os.path.dirname(__file__), 'config',
                        f'{search}.txt')
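All paths are resolved relative to the directory containing this module. An illustrative sketch of the shapes of the returned paths (the '...' prefix stands for that directory):

print(find_dataset_filename('Train', 'Balanced'))
# -> .../datasets/Train/Balanced_Train_dataset.txt
print(find_model_filename('classification', 'RF'))
# -> .../config/models/classification_RF.txt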
@@ -0,0 +1,46 @@
"""IS THIS BEING USED?"""
import numpy as np
from sklearn.preprocessing import normalize


def convert_to_numpy_floats(features):
    return np.array([np.array([np.float64(feature)
                               for feature in feature_list])
                     for feature_list in features])


def normalize_features(features):
    """
    Normalize each column of features.

    The new mean is 0 and the standard deviation is 1 in each column.
    """
    # NB: returns one array per column, i.e. the transpose of the input layout
    normal_features = []
    for feature in zip(*features):
        mean = np.mean(feature)
        std = np.std(feature)
        if std != 0:
            normal_features.append((feature - mean) / std)
        else:
            normal_features.append(feature - mean)
    return normal_features


def normalize_features2(features):
    """
    Normalize each column of features.

    NB: sklearn's normalize() scales each column to unit L2 norm, which is
    not the same as the mean-0/std-1 standardisation done above.
    """
    return normalize(features, axis=0)


# v = convert_to_numpy_floats([[2, 1, 4, 1, 41], [3, 1, 142, 12, 1], [21, 12, 34, 123, 2]])
# print(v[0, 1])

# print(normalize_features2(v) == normalize_features(v))

# print(normalize_features2(v))
# print(normalize_features(v))
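A quick sanity check of normalize_features, assuming it is run in a context where this module's names are available; the numbers are invented:

X = convert_to_numpy_floats([[2, 1, 4], [3, 1, 142], [21, 12, 34]])
for col in normalize_features(X):     # one standardised array per column
    print(np.mean(col), np.std(col))  # approximately 0 and 1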
@@ -0,0 +1,148 @@
import csv
import pickle
import importlib.util
import numpy as np
from sklearn import metrics
from config.general_values import dataset_qualities
from config.ml_models import ml_models
from find_filename import find_output_filename
from find_filename import find_dataset_filename
from find_filename import find_model_filename
# Check if 'dataset_manipulation' is installed
if importlib.util.find_spec('dataset_manipulation') is None:
    from exploit_symmetries import give_all_symmetries
else:
    from packages.dataset_manipulation.exploit_symmetries import give_all_symmetries


# def test_model(trained_model_filename, test_dataset_filename):
#     with open(trained_model_filename, 'rb') as trained_model_file:
#         model = pickle.load(trained_model_file)
#     with open(test_dataset_filename, 'rb') as test_dataset_file:
#         x_test, y_test, _ = pickle.load(test_dataset_file)
#     y_pred = model.predict(x_test)
#     return metrics.accuracy_score(y_test, y_pred)


def test_results(training_method):
    output_filename = find_output_filename(training_method)
    with open(output_filename, 'w') as output_file:
        writer_balanced = csv.writer(output_file)
        writer_balanced.writerow(["Name"] + dataset_qualities)
        for ml_model in ml_models:
            accuracy = dict()
            for testing_method in dataset_qualities:
                # NOTE: this originally passed filenames, matching the
                # commented-out test_model() above; adapted here to the
                # test_model() defined below, which resolves the filenames
                # itself and returns a metrics dict.
                accuracy[testing_method] = test_model(
                    ml_model, training_method, testing_method)['Accuracy']
                print('testing_method', testing_method)
                print('ml_model', ml_model)
                print('acc', accuracy[testing_method])
            round_accuracies = [round(acc, 2)
                                for acc in [accuracy[method]
                                            for method in dataset_qualities]]
            writer_balanced.writerow([ml_model + "-" + training_method] +
                                     round_accuracies)


def test_classifier(ml_model, testing_method='augmented'):
    trained_model_filename = find_model_filename('classification',
                                                 ml_model)
    test_dataset_filename = find_dataset_filename('Test',
                                                  testing_method)
    with open(trained_model_filename, 'rb') as trained_model_file:
        model = pickle.load(trained_model_file)
    with open(test_dataset_filename, 'rb') as test_dataset_file:
        x_test, y_test, all_timings = pickle.load(test_dataset_file)
    chosen_indices = [return_regressor_choice(model, features)
                      for features in x_test]
    # NOTE: compute_metrics() below also expects cell counts as a fourth
    # argument; this call appears to predate that change.
    return compute_metrics(chosen_indices, y_test, all_timings)


def timings_in_test(model, testing_method='augmented', training_method=None):
    # 'Test' must be capitalised to match config.general_values.purposes
    test_dataset_filename = find_dataset_filename('Test',
                                                  testing_method)
    with open(test_dataset_filename, 'rb') as test_dataset_file:
        x_test, _, all_timings = pickle.load(test_dataset_file)
    if model == 'optimal':
        t_pred = [min(timings) for timings in all_timings]
    else:
        trained_model_filename = find_model_filename(training_method,
                                                     model)
        with open(trained_model_filename, 'rb') as trained_model_file:
            model = pickle.load(trained_model_file)
        y_pred = model.predict(x_test)
        # This doesn't work because augmented and balanced
        # only return one timing, not 6
        t_pred = [timings[y] for timings, y in zip(all_timings, y_pred)]
    return t_pred


def test_regressor(ml_model):
    trained_model_filename = find_model_filename('regression',
                                                 ml_model)
    test_dataset_filename = find_dataset_filename('Test',
                                                  'regression')
    with open(trained_model_filename, 'rb') as trained_model_file:
        model = pickle.load(trained_model_file)
    with open(test_dataset_filename, 'rb') as test_dataset_file:
        x_test, y_test, all_timings = pickle.load(test_dataset_file)
    y_pred = model.predict(x_test)
    avg_error = sum(abs(p - t) for p, t in zip(y_pred, y_test)) / len(y_pred)
    print(f"{ml_model} gave {avg_error}")


def test_model(ml_model, paradigm, testing_method='augmented'):
    trained_model_filename = find_model_filename(paradigm,
                                                 ml_model)
    print(trained_model_filename, paradigm, ml_model)
    test_dataset_filename = find_dataset_filename('Test',
                                                  testing_method)
    with open(trained_model_filename, 'rb') as trained_model_file:
        model = pickle.load(trained_model_file)
    with open(test_dataset_filename, 'rb') as test_dataset_file:
        testing_dataset = pickle.load(test_dataset_file)
    chosen_indices = [return_regressor_choice(model, features)
                      for features in testing_dataset['features']]
    return compute_metrics(chosen_indices,
                           testing_dataset['labels'],
                           testing_dataset['timings'],
                           testing_dataset['cells'])


def compute_metrics(chosen_indices, labels, all_timings, all_cells):
    # The local dict shadows the sklearn 'metrics' import above, which is
    # only used by the commented-out test_model().
    metrics = dict()
    correct = 0
    metrics['Total time'] = 0
    total_markup = 0
    metrics['Completed'] = 0
    metrics['Total cells'] = 0
    for chosen_index, label, timings, cells in \
            zip(chosen_indices, labels, all_timings, all_cells):
        if chosen_index == label:
            correct += 1
        if timings[chosen_index] not in [30, 60]:  # presumably timeout values
            metrics['Completed'] += 1
        metrics['Total time'] += timings[chosen_index]
        total_markup += (timings[chosen_index] - timings[label]) \
            / (timings[label] + 1)
        metrics['Total cells'] += cells[chosen_index]
    total_instances = len(chosen_indices)
    metrics['Accuracy'] = correct / total_instances
    metrics['Markup'] = total_markup / total_instances
    return metrics


def return_regressor_choice(model, features):
    features_all_symmetries = give_all_symmetries(features)
    y_op = float('inf')
    for index, x_features in enumerate(features_all_symmetries):
        # predict() returns an array; take the scalar prediction
        y_pred = model.predict([x_features])[0]
        if y_op > y_pred:
            y_op = y_pred
            index_op = index
    return index_op
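To make the metric definitions concrete, here is a toy invocation of compute_metrics with three instances and two projection choices each; every number is invented for illustration:

chosen = [0, 1, 1]   # index picked by the model per instance
labels = [0, 0, 1]   # index of the truly fastest projection
timings = [[1.0, 5.0], [2.0, 30], [4.0, 3.0]]
cells = [[10, 50], [20, 300], [40, 30]]

m = compute_metrics(chosen, labels, timings, cells)
print(m['Accuracy'])  # 2/3: the first and third choices match the label
print(m['Markup'])    # mean of (t_chosen - t_label) / (t_label + 1)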
@@ -0,0 +1,15 @@
import yaml


def write_yaml_to_file(py_obj, filename):
    with open(f'{filename}.yaml', 'w') as f:
        yaml.dump(py_obj, f, sort_keys=False)
    print('Written to file successfully')


def read_yaml_from_file(filename):
    with open(f'{filename}.yaml') as f:
        # py_obj = yaml.safe_load(f)
        py_obj = yaml.load(f, Loader=yaml.Loader)
    print('Read from file successfully')
    return py_obj
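A round-trip sketch of the two helpers; the 'example_config' filename is arbitrary:

config = {'model': 'RF', 'n_estimators': 300}
write_yaml_to_file(config, 'example_config')  # writes example_config.yaml
assert read_yaml_from_file('example_config') == config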