From f48eafb059b3c6419e2f842c7801b018198752e8 Mon Sep 17 00:00:00 2001
From: Tereso del Rio <teresodra@gmail.com>
Date: Thu, 14 Sep 2023 19:52:40 +0200
Subject: [PATCH] Added some essential files

---
 config/general_values.py          |   3 +
 config/hyperparameters_grid.py    |  70 ++++++++++++++
 config/ml_models.py               |  42 +++++++++
 find_filename.py                  |  49 ++++++++++
 preprocessing_Dorians_features.py |  46 ++++++++++
 test_models.py                    | 148 ++++++++++++++++++++++++++++++
 yaml_tools.py                     |  15 +++
 7 files changed, 373 insertions(+)
 create mode 100644 config/general_values.py
 create mode 100644 config/hyperparameters_grid.py
 create mode 100644 config/ml_models.py
 create mode 100644 find_filename.py
 create mode 100644 preprocessing_Dorians_features.py
 create mode 100644 test_models.py
 create mode 100644 yaml_tools.py

diff --git a/config/general_values.py b/config/general_values.py
new file mode 100644
index 0000000..46b5805
--- /dev/null
+++ b/config/general_values.py
@@ -0,0 +1,3 @@
+# Values shared across the project: dataset purposes and dataset qualities.
+purposes = ['Train', 'Test']
+dataset_qualities = ['Normal', 'Balanced', 'Augmented']
diff --git a/config/hyperparameters_grid.py b/config/hyperparameters_grid.py
new file mode 100644
index 0000000..abbef04
--- /dev/null
+++ b/config/hyperparameters_grid.py
@@ -0,0 +1,70 @@
+"""Grid of hyperparameter values that is searched for each model."""
+
+grid = {
+    'RF': {
+        'n_estimators': [200, 300, 400, 500],
+        'max_features': ['sqrt', 'log2'],
+        'max_depth': [4, 5, 6, 7, 8],
+        'criterion': ['gini', 'entropy'],
+    },
+    'KNN': {
+        'n_neighbors': [1, 3, 5, 7, 12],
+        'weights': ['uniform', 'distance'],
+        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
+        # 'leaf_size' and 'p' are left out of the search for now
+    },
+    'MLP': {
+        'hidden_layer_sizes': [(5, 5), (15, 15), (20, 20),
+                               (10, 10, 10), (20, 20, 20)],
+        'activation': ['tanh', 'relu'],
+        'solver': ['sgd', 'adam'],
+        'learning_rate': ['constant', 'adaptive'],
+        'alpha': [0.05, 0.005],
+        'max_iter': [1000],
+    },
+    'DT': {
+        'criterion': ['gini', 'entropy'],
+        'splitter': ['best', 'random'],
+        'max_depth': [1, 4, 7, 10, 13, 16, 19],
+    },
+    'SVC': {
+        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
+        'tol': [0.0316],
+        'C': [5, 100, 200, 300],
+        'gamma': ['scale', 'auto'],
+    },
+    # Regressors
+    'RFR': {
+        'criterion': ['squared_error', 'friedman_mse'],
+        'max_depth': [1, 3, 7],
+        'min_samples_leaf': [1, 5, 10],
+    },
+    'KNNR': {
+        'n_neighbors': [3, 5, 10],
+        'weights': ['uniform', 'distance'],
+        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
+    },
+    'MLPR': {
+        'hidden_layer_sizes': [(100,), (20, 20), (10, 10, 10)],
+        'activation': ['logistic', 'tanh', 'relu'],
+        'solver': ['adam', 'sgd'],
+        'alpha': [0.0001, 0.001, 0.01],
+    },
+    'DTR': {
+        'splitter': ['best', 'random'],
+        'max_depth': [1, 3, 7, 12],
+        'min_samples_leaf': [1, 5, 10],
+        # 'min_weight_fraction_leaf', 'max_features' and 'max_leaf_nodes'
+        # are left out of the search for now
+    },
+    'SVR': {
+        'kernel': ('linear', 'rbf', 'poly'),
+        'C': [1.5, 10],
+        'gamma': [1e-7, 1e-4],
+        'epsilon': [0.1, 0.2, 0.5, 0.3],
+    },
+    'SGD': {
+        'loss': ['squared_error', 'huber', 'epsilon_insensitive'],
+        'penalty': ['l2', 'l1', 'elasticnet'],
+    },
+}
\ No newline at end of file
diff --git a/config/ml_models.py b/config/ml_models.py
new file mode 100644
index 0000000..8db0663
--- /dev/null
+++ b/config/ml_models.py
@@ -0,0 +1,42 @@
+"""Contains the ml models that will be used in the project"""
+
+from sklearn.svm import SVC
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.neural_network import MLPClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.neighbors import KNeighborsClassifier
+
+from sklearn.svm import SVR
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.neural_network import MLPRegressor
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.neighbors import KNeighborsRegressor
+
+ml_models = [  # identifiers of the classifiers used in the project
+    'KNN',
+    'DT',
+    'SVC',
+    'RF',
+    'MLP',
+]
+
+ml_regressors = [  # identifiers of the regressors used in the project
+    'DTR',
+    'SVR',
+    'RFR',
+    'KNNR',
+    'MLPR',
+]
+
+sklearn_models = dict(  # identifier -> scikit-learn estimator class
+    DT=DecisionTreeClassifier,
+    KNN=KNeighborsClassifier,
+    RF=RandomForestClassifier,
+    SVC=SVC,
+    MLP=MLPClassifier,
+    DTR=DecisionTreeRegressor,
+    KNNR=KNeighborsRegressor,
+    RFR=RandomForestRegressor,
+    SVR=SVR,
+    MLPR=MLPRegressor,
+)
diff --git a/find_filename.py b/find_filename.py
new file mode 100644
index 0000000..572e03a
--- /dev/null
+++ b/find_filename.py
@@ -0,0 +1,49 @@
+import os
+from config.general_values import dataset_qualities
+from config.general_values import purposes
+
+
+def find_hyperparams_filename(method, ml_model):
+    """Return the hyperparameters file path for `method` and `ml_model`."""
+    here = os.path.dirname(__file__)
+    return os.path.join(here, 'config', 'hyperparams', f'{method}_{ml_model}')
+
+
+def find_model_filename(method, ml_model):
+    """Return the path of the model file for `method` and `ml_model`."""
+    config_dir = os.path.join(os.path.dirname(__file__), 'config')
+    return os.path.join(config_dir, 'models', f'{method}_{ml_model}.txt')
+
+
+def find_dataset_filename(purpose, method=None):
+    """Return the path of the dataset file that serves `purpose`.
+
+    `purpose` is 'unclean', 'clean', 'instances' or one of `purposes`
+    (which also needs `method`); otherwise ValueError is raised.
+    """
+    here = os.path.dirname(__file__)
+    if purpose == 'unclean':
+        # 'dataset_with_repetition_return_ncells.txt' also has the "repeated"
+        # instances: those with the same number of cells for all projections
+        return os.path.join(here, 'DatasetsBeforeProcessing',
+                            'dataset_without_repetition_return_ncells.txt')
+    if purpose == 'clean':
+        return os.path.join(here, 'datasets', 'clean_dataset.txt')
+    if purpose == 'instances':
+        return os.path.join(here, 'datasets', 'dataset_instances.csv')
+    if purpose not in purposes:
+        raise ValueError(f"Purpose {purpose} not found")
+    if method is None:  # would silently yield 'None_<purpose>_dataset.txt'
+        raise ValueError(f"Purpose {purpose} needs a method")
+    return os.path.join(here, 'datasets', f'{purpose}',
+                        f'{method}_{purpose}_dataset.txt')
+
+
+def find_output_filename(training_method):
+    name = f'ml_trained_in_{training_method}.csv'  # results of one method
+    return os.path.join(os.path.dirname(__file__), 'results', name)
+
+
+def find_other_filename(search):
+    name = f'{search}.txt'  # any other text file kept in config/
+    return os.path.join(os.path.dirname(__file__), 'config', name)
diff --git a/preprocessing_Dorians_features.py b/preprocessing_Dorians_features.py
new file mode 100644
index 0000000..9165571
--- /dev/null
+++ b/preprocessing_Dorians_features.py
@@ -0,0 +1,46 @@
+"""Feature preprocessing helpers (TODO: check whether this is still used)."""
+import numpy as np
+from sklearn.preprocessing import normalize
+
+
+def convert_to_numpy_floats(features):
+    """Return `features` (an iterable of rows) as an array of np.float64."""
+    rows = [np.array(list(map(np.float64, row))) for row in features]
+    return np.array(rows)
+
+
+def normalize_features(features):
+    """
+    Standardise every column of `features`.
+
+    Each column is shifted to mean 0 and, unless it is constant, scaled to
+    standard deviation 1. The result is a list with one array per column,
+    i.e. it is transposed with respect to the input.
+    """
+    standardised = []
+    for column in zip(*features):
+        centred = column - np.mean(column)
+        spread = np.std(column)
+        # A constant column has no spread to divide by: only centre it.
+        standardised.append(centred / spread if spread != 0 else centred)
+    return standardised
+
+
+def normalize_features2(features):
+    """
+    Scale each column of features to unit L2 norm.
+
+    Columns are only rescaled, not centred, and keep their input layout.
+    """
+    return normalize(features, norm='l2', axis=0)
+
+
+
+
+# v = convert_to_numpy_floats([[2,1,4,1,41],[3,1,142,12,1],[21,12,34,123,2]])
+# print(v[0,1])
+
+# print(normalize_features2(v)==normalize_features(v))
+
+# print(normalize_features2(v))
+# print(normalize_features(v))
\ No newline at end of file
diff --git a/test_models.py b/test_models.py
new file mode 100644
index 0000000..55de50d
--- /dev/null
+++ b/test_models.py
@@ -0,0 +1,148 @@
+import csv
+import pickle
+import importlib.util
+import numpy as np
+from sklearn import metrics
+from config.general_values import dataset_qualities
+from config.ml_models import ml_models
+from find_filename import find_output_filename
+from find_filename import find_dataset_filename
+from find_filename import find_model_filename
+# Check if 'dataset_manipulation' is installed
+if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
+    from exploit_symmetries import give_all_symmetries
+else:
+    from packages.dataset_manipulation.exploit_symmetries import give_all_symmetries
+
+
+# def test_model(trained_model_filename, test_dataset_filename):
+#     with open(trained_model_filename, 'rb') as trained_model_file:
+#         model = pickle.load(trained_model_file)
+#     with open(test_dataset_filename, 'rb') as test_dataset_file:
+#         x_test, y_test, _ = pickle.load(test_dataset_file)
+#     y_pred = model.predict(x_test)
+#     return metrics.accuracy_score(y_test, y_pred)
+
+
+def test_results(training_method):
+    """Save to a csv the test accuracy of the models of `training_method`.
+
+    One row per model in `ml_models`, one column per dataset quality.
+    """
+    output_filename = find_output_filename(training_method)
+    # newline='' is what the csv module asks of files it writes to
+    with open(output_filename, 'w', newline='') as output_file:
+        writer = csv.writer(output_file)
+        writer.writerow(["Name"] + dataset_qualities)
+        for ml_model in ml_models:
+            row = [ml_model + "-" + training_method]
+            for testing_method in dataset_qualities:
+                # test_model finds the model and dataset files itself and
+                # returns a dict of metrics; only the accuracy is reported
+                accuracy = test_model(ml_model, training_method,
+                                      testing_method)['Accuracy']
+                print('testing_method', testing_method)
+                print('ml_model', ml_model)
+                print('acc', accuracy)
+                row.append(round(accuracy, 2))
+            writer.writerow(row)
+
+
+def test_classifier(ml_model, testing_method='augmented'):
+    """Return the metrics of the classifier `ml_model` on a test dataset.
+
+    Delegates to `test_model` with the 'classification' paradigm, which
+    loads the same model and dataset files and passes `compute_metrics`
+    the cells it requires (a direct call without them raises TypeError).
+    As a side effect `test_model` prints the model file it opens.
+    NOTE(review): `test_model` expects the dataset pickled as a dict with
+    'features', 'labels', 'timings' and 'cells'; confirm the files match.
+    """
+    return test_model(ml_model, 'classification', testing_method)
+
+
+def timings_in_test(model, testing_method='augmented', training_method=None):
+    """Return the timing of the option chosen for each test instance.
+
+    `model` is 'optimal' (fastest option) or the name of a trained model.
+    """
+    # 'Test' as spelled in `purposes`: the lookup rejects 'test'
+    test_dataset_filename = find_dataset_filename('Test', testing_method)
+    with open(test_dataset_filename, 'rb') as test_dataset_file:
+        x_test, _, all_timings = pickle.load(test_dataset_file)
+    if model == 'optimal':
+        return [min(timings) for timings in all_timings]
+    trained_model_filename = find_model_filename(training_method, model)
+    with open(trained_model_filename, 'rb') as trained_model_file:
+        y_pred = pickle.load(trained_model_file).predict(x_test)
+    # Original note: the augmented and balanced datasets hold a single
+    # timing per instance instead of six, so this indexing fails on them
+    return [timings[y] for timings, y in zip(all_timings, y_pred)]
+
+
+def test_regressor(ml_model):
+    """Print the mean absolute error of `ml_model` on its test dataset."""
+    trained_model_filename = find_model_filename('regression', ml_model)
+    # 'Test' as spelled in `purposes`: the lookup rejects 'test'
+    test_dataset_filename = find_dataset_filename('Test', 'regression')
+    with open(trained_model_filename, 'rb') as trained_model_file:
+        model = pickle.load(trained_model_file)
+    with open(test_dataset_filename, 'rb') as test_dataset_file:
+        x_test, y_test, _ = pickle.load(test_dataset_file)
+    y_pred = model.predict(x_test)
+    avg_error = sum(abs(p-t) for p, t in zip(y_pred, y_test))/len(y_pred)
+    print(f"{ml_model} gave {avg_error}")
+
+
+def test_model(ml_model, paradigm, testing_method='augmented'):
+    """Return the metrics of a trained model on a test dataset.
+
+    The model comes from the file for (`paradigm`, `ml_model`) and the
+    dataset, a dict, from the 'Test' file of `testing_method`.
+    """
+    model_path = find_model_filename(paradigm, ml_model)
+    print(model_path, paradigm, ml_model)
+    dataset_path = find_dataset_filename('Test', testing_method)
+    with open(model_path, 'rb') as model_file:
+        model = pickle.load(model_file)
+    with open(dataset_path, 'rb') as dataset_file:
+        data = pickle.load(dataset_file)
+    choices = [return_regressor_choice(model, row) for row in data['features']]
+    return compute_metrics(choices, data['labels'],
+                           data['timings'], data['cells'])
+
+
+def compute_metrics(chosen_indices, labels, all_timings, all_cells):
+    """Return a dict summarising the choices made on a dataset.
+
+    Timings of 30 or 60 are not counted as completed (presumably they
+    are timeouts; confirm). With no instances every value is 0.
+    """
+    results = {'Total time': 0, 'Completed': 0, 'Total cells': 0}
+    correct = total_markup = 0
+    for chosen, label, timings, cells in \
+            zip(chosen_indices, labels, all_timings, all_cells):
+        chosen_time, best_time = timings[chosen], timings[label]
+        correct += 1 if chosen == label else 0
+        results['Completed'] += 0 if chosen_time in (30, 60) else 1
+        results['Total time'] += chosen_time
+        total_markup += (chosen_time - best_time)/(best_time + 1)
+        results['Total cells'] += cells[chosen]
+    total_instances = max(len(chosen_indices), 1)  # no division by zero
+    results['Accuracy'] = correct/total_instances
+    results['Markup'] = total_markup/total_instances
+    return results
+
+
+def return_regressor_choice(model, features):
+    """Return the index of the symmetry with the lowest `model` prediction."""
+    index_op, y_op = None, float('inf')
+    for index, x_features in enumerate(give_all_symmetries(features)):
+        y_pred = model.predict([x_features])  # predict expects a batch
+        if y_op > y_pred:  # strict: ties keep the earliest symmetry
+            index_op, y_op = index, y_pred
+    if index_op is None:
+        # Nothing to choose from, or no prediction below infinity: fail
+        # clearly rather than with the UnboundLocalError of a bare return
+        raise ValueError("no symmetry has a usable prediction")
+    return index_op
diff --git a/yaml_tools.py b/yaml_tools.py
new file mode 100644
index 0000000..2af2972
--- /dev/null
+++ b/yaml_tools.py
@@ -0,0 +1,15 @@
+import yaml
+
+
+def write_yaml_to_file(py_obj, filename):
+    with open(f'{filename}.yaml', 'w') as yaml_file:  # '.yaml' is appended
+        yaml.dump(py_obj, yaml_file, sort_keys=False)  # keep the key order
+    print('Written to file successfully')
+
+
+def read_yaml_from_file(filename):
+    """Load '<filename>.yaml'; yaml.Loader is unsafe on untrusted files."""
+    with open(f'{filename}.yaml') as yaml_file:
+        loaded = yaml.load(yaml_file, Loader=yaml.Loader)
+    print('Read from file successfully')
+    return loaded