From f48eafb059b3c6419e2f842c7801b018198752e8 Mon Sep 17 00:00:00 2001
From: Tereso del Rio
Date: Thu, 14 Sep 2023 19:52:40 +0200
Subject: [PATCH] Added some essential files

---
Review note: the file contents below were revised after the original
commit; the blob ids on the "index" lines were not recomputed.

 config/general_values.py          |   4 +
 config/hyperparameters_grid.py    |  75 ++++++++++++
 config/ml_models.py               |  45 +++++++
 find_filename.py                  |  59 ++++++++++
 preprocessing_Dorians_features.py |  47 ++++++++
 test_models.py                    | 183 ++++++++++++++++++++++++++++++
 yaml_tools.py                     |  20 +++
 7 files changed, 433 insertions(+)
 create mode 100644 config/general_values.py
 create mode 100644 config/hyperparameters_grid.py
 create mode 100644 config/ml_models.py
 create mode 100644 find_filename.py
 create mode 100644 preprocessing_Dorians_features.py
 create mode 100644 test_models.py
 create mode 100644 yaml_tools.py

diff --git a/config/general_values.py b/config/general_values.py
new file mode 100644
index 0000000..46b5805
--- /dev/null
+++ b/config/general_values.py
@@ -0,0 +1,4 @@
+"""Constant lists shared by the dataset and model scripts."""
+
+purposes = ['Train', 'Test']
+dataset_qualities = ['Normal', 'Balanced', 'Augmented']
diff --git a/config/hyperparameters_grid.py b/config/hyperparameters_grid.py
new file mode 100644
index 0000000..abbef04
--- /dev/null
+++ b/config/hyperparameters_grid.py
@@ -0,0 +1,75 @@
+"""Contains the grid of hyperparameters that each model will try"""
+
+grid = dict()
+
+# Grids for the classifiers in config/ml_models.py.
+grid['RF'] = {
+    'n_estimators': [200, 300, 400, 500],
+    'max_features': ['sqrt', 'log2'],
+    'max_depth': [4, 5, 6, 7, 8],
+    'criterion': ['gini', 'entropy'],
+}
+grid['KNN'] = {
+    'n_neighbors': [1, 3, 5, 7, 12],
+    'weights': ['uniform', 'distance'],
+    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
+    # 'leaf_size': range(1, 10, 3),
+    # 'p': range(1, 4, 1)
+}
+grid['MLP'] = {
+    'hidden_layer_sizes': [(5, 5), (15, 15), (20, 20),
+                           (10, 10, 10), (20, 20, 20)],
+    'activation': ['tanh', 'relu'],
+    'solver': ['sgd', 'adam'],
+    'learning_rate': ['constant', 'adaptive'],
+    'alpha': [0.05, 0.005],
+    'max_iter': [1000],
+}
+grid['DT'] = {
+    'criterion': ['gini', 'entropy'],
+    'splitter': ['best', 'random'],
+    'max_depth': [1, 4, 7, 10, 13, 16, 19],
+}
+grid['SVC'] = {
+    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
+    'tol': [0.0316],
+    'C': [5, 100, 200, 300],
+    'gamma': ['scale', 'auto'],
+}
+
+# Grids for the regressors in config/ml_models.py.
+grid['RFR'] = {
+    'criterion': ['squared_error', 'friedman_mse'],
+    'max_depth': [1, 3, 7],
+    'min_samples_leaf': [1, 5, 10],
+}
+grid['KNNR'] = {
+    'n_neighbors': [3, 5, 10],
+    'weights': ['uniform', 'distance'],
+    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
+}
+grid['MLPR'] = {
+    'hidden_layer_sizes': [(100,), (20, 20), (10, 10, 10)],
+    'activation': ['logistic', 'tanh', 'relu'],
+    'solver': ['adam', 'sgd'],
+    'alpha': [0.0001, 0.001, 0.01],
+}
+grid['DTR'] = {
+    'splitter': ['best', 'random'],
+    'max_depth': [1, 3, 7, 12],
+    'min_samples_leaf': [1, 5, 10],
+    # 'min_weight_fraction_leaf': [0.1, 0.5, 0.9],
+    # 'max_features': ['auto', 'log2', 'sqrt', None],
+    # 'max_leaf_nodes': [None, 10, 50, 90]
+}
+grid['SVR'] = {
+    'kernel': ['linear', 'rbf', 'poly'],
+    'C': [1.5, 10],
+    'gamma': [1e-7, 1e-4],
+    'epsilon': [0.1, 0.2, 0.5, 0.3],
+}
+# NOTE(review): no 'SGD' model is declared in config/ml_models.py.
+grid['SGD'] = {
+    'loss': ['squared_error', 'huber', 'epsilon_insensitive'],
+    'penalty': ['l2', 'l1', 'elasticnet'],
+}
diff --git a/config/ml_models.py b/config/ml_models.py
new file mode 100644
index 0000000..8db0663
--- /dev/null
+++ b/config/ml_models.py
@@ -0,0 +1,45 @@
+"""Contains the ml models that will be used in the project"""
+
+from sklearn.svm import SVC
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.neural_network import MLPClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.neighbors import KNeighborsClassifier
+
+from sklearn.svm import SVR
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.neural_network import MLPRegressor
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.neighbors import KNeighborsRegressor
+
+# Keys of sklearn_models that are classifiers.
+ml_models = [
+    'KNN',
+    'DT',
+    'SVC',
+    'RF',
+    'MLP',
+]
+
+# Keys of sklearn_models that are regressors.
+ml_regressors = [
+    'DTR',
+    'SVR',
+    'RFR',
+    'KNNR',
+    'MLPR',
+]
+
+# Estimator class for each model name.
+sklearn_models = {
+    'DT': DecisionTreeClassifier,
+    'KNN': KNeighborsClassifier,
+    'RF': RandomForestClassifier,
+    'SVC': SVC,
+    'MLP': MLPClassifier,
+    'DTR': DecisionTreeRegressor,
+    'KNNR': KNeighborsRegressor,
+    'RFR': RandomForestRegressor,
+    'SVR': SVR,
+    'MLPR': MLPRegressor,
+}
diff --git a/find_filename.py b/find_filename.py
new file mode 100644
index 0000000..572e03a
--- /dev/null
+++ b/find_filename.py
@@ -0,0 +1,59 @@
+"""Build the paths of the files used by the project."""
+import os
+from config.general_values import dataset_qualities
+from config.general_values import purposes
+
+# Directory of this module; every path returned below starts from it.
+_PROJECT_DIR = os.path.dirname(__file__)
+
+
+def find_hyperparams_filename(method, ml_model):
+    """Return the path of config/hyperparams/{method}_{ml_model}."""
+    return os.path.join(_PROJECT_DIR, 'config', 'hyperparams',
+                        f'{method}_{ml_model}')
+
+
+def find_model_filename(method, ml_model):
+    """Return the path of config/models/{method}_{ml_model}.txt."""
+    return os.path.join(_PROJECT_DIR, 'config', 'models',
+                        f'{method}_{ml_model}.txt')
+
+
+def find_dataset_filename(purpose, method=None):
+    """
+    Return the path of the dataset selected by purpose.
+
+    purpose is 'unclean', 'clean', 'instances' or a value listed in
+    config.general_values.purposes; method is only used in the last
+    case, as the prefix of the file name.
+
+    Raises ValueError for any other purpose.
+    """
+    if purpose == "unclean":
+        # Use 'dataset_with_repetition_return_ncells.txt' to also return
+        # the "repeated" instances: those with the same number of cells
+        # for all projections.
+        return os.path.join(_PROJECT_DIR, 'DatasetsBeforeProcessing',
+                            'dataset_without_repetition_return_ncells.txt')
+    elif purpose == "clean":
+        return os.path.join(_PROJECT_DIR, 'datasets', 'clean_dataset.txt')
+    elif purpose == 'instances':
+        return os.path.join(_PROJECT_DIR, 'datasets',
+                            'dataset_instances.csv')
+    elif purpose in purposes:
+        return os.path.join(_PROJECT_DIR, 'datasets', f'{purpose}',
+                            f'{method}_{purpose}_dataset.txt')
+    else:
+        # ValueError is an Exception, so existing handlers still catch it.
+        raise ValueError(f"Purpose {purpose} not found")
+
+
+def find_output_filename(training_method):
+    """Return the path of results/ml_trained_in_{training_method}.csv."""
+    return os.path.join(_PROJECT_DIR, 'results',
+                        f'ml_trained_in_{training_method}.csv')
+
+
+def find_other_filename(search):
+    """Return the path of config/{search}.txt."""
+    return os.path.join(_PROJECT_DIR, 'config', f'{search}.txt')
diff --git a/preprocessing_Dorians_features.py b/preprocessing_Dorians_features.py
new file mode 100644
index 0000000..9165571
--- /dev/null
+++ b/preprocessing_Dorians_features.py
@@ -0,0 +1,47 @@
+"""Feature conversion and normalisation helpers."""
+# NOTE(review): the original header asked "IS THIS BEING USED?" - confirm
+# whether anything imports this module.
+import numpy as np
+from sklearn.preprocessing import normalize
+
+
+def convert_to_numpy_floats(features):
+    """Return features (a sequence of rows) as arrays of np.float64."""
+    return np.array([np.array([np.float64(feature)
+                               for feature in feature_list])
+                     for feature_list in features])
+
+
+def normalize_features(features):
+    """
+    Standardise each column of features.
+
+    Every column is shifted to mean 0 and, unless it is constant, scaled
+    to standard deviation 1.
+
+    Returns a list with one array per column, i.e. transposed with
+    respect to the input rows.
+    """
+    normal_features = []
+    for feature in zip(*features):
+        # zip yields tuples; convert so the arithmetic is element-wise.
+        column = np.asarray(feature)
+        mean = np.mean(column)
+        std = np.std(column)
+        if std != 0:
+            normal_features.append((column - mean) / std)
+        else:
+            # Constant column: only centre it to avoid dividing by zero.
+            normal_features.append(column - mean)
+    return normal_features
+
+
+def normalize_features2(features):
+    """
+    Scale each column of features to unit norm.
+
+    This delegates to sklearn.preprocessing.normalize with axis=0, which
+    rescales columns but does not centre them, so the result differs from
+    normalize_features.
+    """
+    return normalize(features, axis=0)
diff --git a/test_models.py b/test_models.py
new file mode 100644
index 0000000..55de50d
--- /dev/null
+++ b/test_models.py
@@ -0,0 +1,183 @@
+"""Evaluate the trained models on the test datasets."""
+import csv
+import pickle
+import importlib.util
+from itertools import repeat
+import numpy as np
+from sklearn import metrics
+from config.general_values import dataset_qualities
+from config.ml_models import ml_models
+from find_filename import find_output_filename
+from find_filename import find_dataset_filename
+from find_filename import find_model_filename
+# Check if 'dataset_manipulation' is installed
+if importlib.util.find_spec('dataset_manipulation') is None:
+    from exploit_symmetries import give_all_symmetries
+else:
+    from packages.dataset_manipulation.exploit_symmetries import give_all_symmetries
+
+
+def _load_pickle(filename):
+    """Return the object pickled in filename."""
+    # WARNING: unpickling can run arbitrary code; only load trusted files.
+    with open(filename, 'rb') as pickle_file:
+        return pickle.load(pickle_file)
+
+
+def test_results(training_method):
+    """
+    Write a CSV with the accuracy of every model in ml_models.
+
+    Each model trained with training_method is tested on every dataset
+    quality; the accuracies, rounded to two decimals, are written to
+    find_output_filename(training_method).
+    """
+    output_filename = find_output_filename(training_method)
+    # newline='' is what the csv module asks of the files it writes to.
+    with open(output_filename, 'w', newline='') as output_file:
+        writer = csv.writer(output_file)
+        writer.writerow(["Name"] + dataset_qualities)
+        for ml_model in ml_models:
+            accuracy = dict()
+            for testing_method in dataset_qualities:
+                # test_model takes the model name and the paradigm, not
+                # file names, and returns a dict of metrics.
+                model_metrics = test_model(ml_model, training_method,
+                                           testing_method)
+                accuracy[testing_method] = model_metrics['Accuracy']
+                print('testing_method', testing_method)
+                print('ml_model', ml_model)
+                print('acc', accuracy[testing_method])
+            round_accuracies = [round(accuracy[method], 2)
+                                for method in dataset_qualities]
+            writer.writerow([ml_model + "-" + training_method]
+                            + round_accuracies)
+
+
+def test_classifier(ml_model, testing_method='augmented'):
+    """
+    Return the metrics of the 'classification' model named ml_model.
+
+    The test dataset is unpickled as a (features, labels, timings)
+    tuple, so no cell counts are reported.
+    """
+    trained_model_filename = find_model_filename('classification', ml_model)
+    test_dataset_filename = find_dataset_filename('Test', testing_method)
+    model = _load_pickle(trained_model_filename)
+    x_test, y_test, all_timings = _load_pickle(test_dataset_filename)
+    chosen_indices = [return_regressor_choice(model, features)
+                      for features in x_test]
+    return compute_metrics(chosen_indices, y_test, all_timings)
+
+
+def timings_in_test(model, testing_method='augmented', training_method=None):
+    """
+    Return the timing obtained on each test instance.
+
+    If model is 'optimal' the smallest timing of every instance is
+    returned; otherwise the model named model and trained with
+    training_method is loaded and the timing at each predicted index is
+    returned.
+    """
+    # 'Test' is the spelling that find_dataset_filename accepts.
+    test_dataset_filename = find_dataset_filename('Test', testing_method)
+    x_test, _, all_timings = _load_pickle(test_dataset_filename)
+    if model == 'optimal':
+        return [min(timings) for timings in all_timings]
+    trained_model = _load_pickle(find_model_filename(training_method, model))
+    y_pred = trained_model.predict(x_test)
+    # This doesn't work because augmented and balanced
+    # only return one timing, not 6
+    return [timings[y] for timings, y in zip(all_timings, y_pred)]
+
+
+def test_regressor(ml_model):
+    """Print and return the mean absolute error of a 'regression' model."""
+    trained_model_filename = find_model_filename('regression', ml_model)
+    # 'Test' is the spelling that find_dataset_filename accepts.
+    test_dataset_filename = find_dataset_filename('Test', 'regression')
+    model = _load_pickle(trained_model_filename)
+    x_test, y_test, _ = _load_pickle(test_dataset_filename)
+    y_pred = model.predict(x_test)
+    avg_error = sum(abs(p - t) for p, t in zip(y_pred, y_test)) / len(y_pred)
+    print(f"{ml_model} gave {avg_error}")
+    return avg_error
+
+
+def test_model(ml_model, paradigm, testing_method='augmented'):
+    """
+    Return the metrics of the model ml_model trained under paradigm.
+
+    The test dataset is unpickled as a dict with the keys 'features',
+    'labels', 'timings' and 'cells'.
+    """
+    trained_model_filename = find_model_filename(paradigm, ml_model)
+    print(trained_model_filename, paradigm, ml_model)
+    test_dataset_filename = find_dataset_filename('Test', testing_method)
+    model = _load_pickle(trained_model_filename)
+    testing_dataset = _load_pickle(test_dataset_filename)
+    chosen_indices = [return_regressor_choice(model, features)
+                      for features in testing_dataset['features']]
+    return compute_metrics(chosen_indices,
+                           testing_dataset['labels'],
+                           testing_dataset['timings'],
+                           testing_dataset['cells'])
+
+
+def compute_metrics(chosen_indices, labels, all_timings, all_cells=None):
+    """
+    Summarise how good the chosen indices are.
+
+    Returns a dict with the keys 'Total time', 'Completed', 'Accuracy',
+    'Markup' and, when all_cells is given, 'Total cells'.
+
+    Raises ZeroDivisionError if chosen_indices is empty.
+    """
+    # Named results so the imported sklearn 'metrics' is not shadowed.
+    results = {'Total time': 0, 'Completed': 0}
+    if all_cells is None:
+        # Pair every instance with a placeholder so the zip below works.
+        all_cells = repeat(None)
+    else:
+        results['Total cells'] = 0
+    correct = 0
+    total_markup = 0
+    for chosen_index, label, timings, cells in \
+            zip(chosen_indices, labels, all_timings, all_cells):
+        if chosen_index == label:
+            correct += 1
+        chosen_time = timings[chosen_index]
+        # NOTE(review): 30 and 60 look like timeout values - confirm.
+        if chosen_time not in [30, 60]:
+            results['Completed'] += 1
+        results['Total time'] += chosen_time
+        # Slowdown relative to the labelled choice; the +1 avoids a zero
+        # denominator when the labelled timing is 0.
+        total_markup += (chosen_time - timings[label]) / (timings[label] + 1)
+        if cells is not None:
+            results['Total cells'] += cells[chosen_index]
+    total_instances = len(chosen_indices)
+    results['Accuracy'] = correct / total_instances
+    results['Markup'] = total_markup / total_instances
+    return results
+
+
+def return_regressor_choice(model, features):
+    """
+    Return the index of the symmetry with the smallest prediction.
+
+    model.predict is called on every element of
+    give_all_symmetries(features); the first minimum wins. Raises
+    ValueError if no element gets a prediction below infinity.
+    """
+    features_all_symmetries = give_all_symmetries(features)
+    y_op = float('inf')
+    index_op = None
+    for index, x_features in enumerate(features_all_symmetries):
+        # predict returns one value per sample and one sample is given.
+        y_pred = model.predict([x_features])[0]
+        if y_op > y_pred:
+            y_op = y_pred
+            index_op = index
+    if index_op is None:
+        raise ValueError("No symmetry got a prediction below infinity")
+    return index_op
diff --git a/yaml_tools.py b/yaml_tools.py
new file mode 100644
index 0000000..2af2972
--- /dev/null
+++ b/yaml_tools.py
@@ -0,0 +1,20 @@
+"""Helpers to write and read YAML files."""
+import yaml
+
+
+def write_yaml_to_file(py_obj, filename):
+    """Dump py_obj to '{filename}.yaml' without sorting its keys."""
+    with open(f'{filename}.yaml', 'w') as f:
+        yaml.dump(py_obj, f, sort_keys=False)
+    print('Written to file successfully')
+
+
+def read_yaml_from_file(filename):
+    """Return the object stored in '{filename}.yaml'."""
+    with open(f'{filename}.yaml') as f:
+        # WARNING: yaml.Loader can construct arbitrary Python objects, so
+        # only read trusted files. yaml.safe_load(f) is the safe option
+        # but cannot rebuild Python-specific tags.
+        py_obj = yaml.load(f, Loader=yaml.Loader)
+    print('Read from file successfully')
+    return py_obj