diff --git a/Heuristics/heuristics_guess.py b/Heuristics/heuristics_guess.py index 01bdc29..c7a8c00 100644 --- a/Heuristics/heuristics_guess.py +++ b/Heuristics/heuristics_guess.py @@ -10,9 +10,9 @@ from .heuristic_tools import greedy_heuristics, expensive_heuristics, create_pseudorderings, ml_models -def choose_order_given_projections(projections, method="gmods"): +def ordering_given_projections(projections, method="gmods"): '''Returns the order guessed by the heuristic requested''' - if method in greedy_heuristics or type(method) == int: + if method in greedy_heuristics or type(method) == int or method == 'T1': guess = greedy_heuristic_guess(projections, heuristic=method) return guess elif method in expensive_heuristics: diff --git a/Heuristics/heuristics_rules.py b/Heuristics/heuristics_rules.py index 60ff9bf..9d2de1e 100644 --- a/Heuristics/heuristics_rules.py +++ b/Heuristics/heuristics_rules.py @@ -52,9 +52,9 @@ def choose_variables_minimizing(degrees_list, measure='gmods', var_list=''): # elif measure == 'avegsumdeg': # sum_degrees_overall_polys = [np.average([sum([monomial[var] for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list] # for each variable, the total degree of each polynomial is computed. Then for each variable this values are added because is what we really care about. # return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)] # var_list is filtered - # elif measure == 'avegavegdeg': - # aveg_degrees_overall_polys = [np.average([np.average([monomial[var] for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list] # for each variable, the total degree of each polynomial is computed. Then for each variable this values are added because is what we really care about. - # return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(aveg_degrees_overall_polys)] # var_list is filtered + elif measure == 'avegavegdeg': + aveg_degrees_overall_polys = [np.average([np.average([monomial[var] for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list] # for each variable, its average degree over the monomials of each polynomial is computed; these per-polynomial averages are then averaged over all polynomials, which is the aggregate we care about + return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(aveg_degrees_overall_polys)] # var_list is filtered # elif measure == 'maxsumdeg': # sum_degrees_overall_polys = [max([sum([monomial[var] for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list] # for each variable, the total degree of each polynomial is computed. Then for each variable this values are added because is what we really care about. @@ -62,9 +62,9 @@ def choose_variables_minimizing(degrees_list, measure='gmods', var_list=''): elif measure == 'sumsignsumdeg': sum_degrees_overall_polys = [np.sum(np.sign([np.sum([monomial[var] for monomial in polynomial]) for polynomial in degrees_list])) for var in var_list] # for each variable, the total degree of each polynomial is computed. Then for each variable this values are added because is what we really care about. return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)] # var_list is filtered - # elif measure == 'sumsumdeg': - # sum_degrees_overall_polys = [sum([sum([monomial[var] for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list] # for each variable, the total degree of each polynomial is computed. 
Then for each variable this values are added because is what we really care about. - # return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)] # var_list is filtered + elif measure == 'sumsumdeg': + sum_degrees_overall_polys = [sum([sum([monomial[var] for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list] # for each variable, its degrees are summed over the monomials of each polynomial; these per-polynomial sums are then added over all polynomials, which is the aggregate we care about + return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)] # var_list is filtered # elif measure == 'avegvegsigndeg': # sum_degrees_overall_polys = [np.average([np.average([np.sign(monomial[var]) for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list] # for each variable, the total degree of each polynomial is computed. Then for each variable this values are added because is what we really care about. # return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)] # var_list is filtered @@ -119,6 +119,8 @@ def choose_variables_minimizing(degrees_list, measure='gmods', var_list=''): def get_order_measure(heuristic, if_tie='random'): if heuristic == 'brown': order_measure = ['brown1', 'brown2', 'brown3', if_tie] + elif heuristic == 'T1': + order_measure = ['gmods', 'avegavegdeg', 'sumsumdeg'] elif type(heuristic) == int: order_measure = list(paper_all_pos[heuristic])+[if_tie] else: diff --git a/choose_hyperparams.py b/choose_hyperparams.py index bf2410c..f3beb66 100644 --- a/choose_hyperparams.py +++ b/choose_hyperparams.py @@ -1,8 +1,8 @@ import os import pickle import csv -from config.ml_models import ml_models -from config.ml_models import sklearn_models +from config.ml_models import classifiers +from config.ml_models import all_models from config.general_values import dataset_qualities from config.hyperparameters_grid import grid from sklearn.model_selection import GridSearchCV @@ -17,7 +17,7 @@ def k_folds_ml(x_train, y_train, model, random_state=0): The hyperparameters of the models are chosen using 5-fold cross validation. 
""" - current_classifier = sklearn_models[model] + current_classifier = all_models[model] current_grid = grid[model] rf_cv = GridSearchCV(estimator=current_classifier(), param_grid=current_grid, @@ -63,7 +63,7 @@ def choose_hyperparams(ml_model, method): # with open(output_file_normal, 'w') as f_normal: # writer_normal = csv.writer(f_normal) # writer_normal.writerow(["Name"] + dataset_qualities) -# for ml_model in ml_models: +# for ml_model in classifiers: # print(f"Model: {ml_model}") # acc_balanced = dict() # acc_normal = dict() @@ -79,7 +79,7 @@ def choose_hyperparams(ml_model, method): # os.path.join(os.path.dirname(__file__), # 'config', 'hyperparams', # f'{method}_{ml_model}')) -# current_classifier = sklearn_models[ml_model] +# current_classifier = all_models[ml_model] # clf = current_classifier(**hyperparams) # clf.fit(x_train, y_train) # acc_balanced[method] = clf.score(balanced_x_test, diff --git a/config/hyperparameters_grid.py b/config/hyperparameters_grid.py index cc0182a..6f30462 100644 --- a/config/hyperparameters_grid.py +++ b/config/hyperparameters_grid.py @@ -1,70 +1,89 @@ """Contains the grid of hyperparameters that each model will try""" grid = dict() -grid['RF'] = { +grid['RF-Classifier'] = { 'n_estimators': [200, 300, 400, 500], 'max_features': ['sqrt', 'log2'], 'max_depth': [4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy'] } -grid['KNN'] = { - 'n_neighbors': [1,3,5,7,12], +grid['KNN-Classifier'] = { + 'n_neighbors': [1, 3, 5, 7, 12], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], - #'leaf_size': range(1, 10, 3), - #'p': range(1, 4, 1) + # 'leaf_size': range(1, 10, 3), + # 'p': range(1, 4, 1) } -grid['MLP'] = { - 'hidden_layer_sizes': [(5,5), (15,15), (20,20), (10,10,10), (20,20,20)], #[(i,i) for i in range(50, 20, 5)],# +[(i,i, i) for i in range(50, 20, 5)], +grid['MLP-Classifier'] = { + 'hidden_layer_sizes': [(5, 5), (15, 15), (20, 20), + (10, 10, 10), (20, 20, 20)], 'activation': ['tanh', 'relu'], 'solver': ['sgd', 'adam'], - 'learning_rate': ['constant','adaptive'], + 'learning_rate': ['constant', 'adaptive'], 'alpha': [0.05, 0.005], 'max_iter': [1000] } -grid['DT'] = { +grid['DT-Classifier'] = { 'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'], - 'max_depth': [1,4,7,10,13,16,19] + 'max_depth': [1, 4, 7, 10, 13, 16, 19] } -grid['SVC'] = { - 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], +grid['SVM-Classifier'] = { + 'kernel': ['rbf', 'sigmoid'], 'tol': [0.0316], - 'C': [5,100,200,300], + 'C': [5, 100, 300], 'gamma': ['scale', 'auto'] } +grid['GB-Classifier'] = { + 'n_estimators': [50, 200], + 'learning_rate': [0.01, 0.1], + 'max_depth': [3, 5], + 'min_samples_split': [2, 4], + 'min_samples_leaf': [1, 3] +} -grid['RFR'] = { +grid['RF-Regressor'] = { 'criterion': ['squared_error', 'friedman_mse'], - "max_depth": [1,3,7], - "min_samples_leaf": [1,5,10], + "max_depth": [1, 3, 7], + "min_samples_leaf": [1, 5, 10], } -grid['KNNR'] = { +grid['KNN-Regressor'] = { 'n_neighbors': [3, 5, 10], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'] } -grid['MLPR'] = { +grid['MLP-Regressor'] = { 'hidden_layer_sizes': [(100,), (20, 20), (10, 10, 10)], 'activation': ['logistic', 'tanh', 'relu'], 'solver': ['adam', 'sgd'], 'alpha': [0.0001, 0.001, 0.01] } -grid['DTR'] = { - "splitter":["best","random"], - "max_depth" : [1,3,7,12], - "min_samples_leaf":[1,5,10], +grid['DT-Regressor'] = { + "splitter": ["best", "random"], + "max_depth": [1, 3, 7, 12], + "min_samples_leaf": [1, 5, 10], # 
"min_weight_fraction_leaf":[0.1,0.5,0.9], # "max_features":["auto","log2","sqrt",None], # "max_leaf_nodes":[None,10,50,90] } -grid['SVR'] = { - 'kernel': ('linear', 'rbf','poly'), - 'C':[1.5, 10], - 'gamma': [1e-7, 1e-4], - 'epsilon':[0.1,0.2,0.5,0.3] +grid['SVM-Regressor'] = { + 'kernel': ['rbf'], + 'C': [0.1, 1, 10], + 'gamma': [1e-4, 1e-3, 1e-2], + 'epsilon': [0.1, 0.2] +# 'kernel': ('linear', 'rbf', 'poly'), +# 'C': [1.5, 10], +# 'gamma': [1e-7, 1e-4], +# 'epsilon': [0.1, 0.2, 0.5] +} +grid['GB-Regressor'] = { + 'n_estimators': [50, 200], + 'learning_rate': [0.01, 0.1], + 'max_depth': [3, 5], + 'min_samples_split': [2, 4], + 'min_samples_leaf': [1, 3] } grid['SGD'] = { - 'loss':["squared_error", "huber", "epsilon_insensitive"], - 'penalty':["l2", "l1", "elasticnet"] + 'loss': ["squared_error", "huber", "epsilon_insensitive"], + 'penalty': ["l2", "l1", "elasticnet"] } diff --git a/config/hyperparams/augmented_DT.yaml b/config/hyperparams/augmented_DT.yaml deleted file mode 100644 index 5e7ea5d..0000000 --- a/config/hyperparams/augmented_DT.yaml +++ /dev/null @@ -1,3 +0,0 @@ -criterion: gini -max_depth: 19 -splitter: random diff --git a/config/hyperparams/augmented_KNN.yaml b/config/hyperparams/augmented_KNN.yaml deleted file mode 100644 index ea5b9b4..0000000 --- a/config/hyperparams/augmented_KNN.yaml +++ /dev/null @@ -1,3 +0,0 @@ -algorithm: auto -n_neighbors: 12 -weights: distance diff --git a/config/hyperparams/augmented_MLP.yaml b/config/hyperparams/augmented_MLP.yaml deleted file mode 100644 index fca174d..0000000 --- a/config/hyperparams/augmented_MLP.yaml +++ /dev/null @@ -1,8 +0,0 @@ -activation: tanh -alpha: 0.005 -hidden_layer_sizes: !!python/tuple -- 20 -- 20 -learning_rate: constant -max_iter: 1000 -solver: adam diff --git a/config/hyperparams/augmented_RF.yaml b/config/hyperparams/augmented_RF.yaml deleted file mode 100644 index 94d3a0f..0000000 --- a/config/hyperparams/augmented_RF.yaml +++ /dev/null @@ -1,6 +0,0 @@ -class_weight: null -criterion: entropy -max_depth: null -min_samples_leaf: 1 -min_samples_split: 2 -n_estimators: 200 diff --git a/config/hyperparams/augmented_SVC.yaml b/config/hyperparams/augmented_SVC.yaml deleted file mode 100644 index 505a4fd..0000000 --- a/config/hyperparams/augmented_SVC.yaml +++ /dev/null @@ -1,4 +0,0 @@ -C: 100 -gamma: auto -kernel: rbf -tol: 0.0316 diff --git a/config/hyperparams/bal_DT.yaml b/config/hyperparams/bal_DT.yaml deleted file mode 100644 index da4ceb5..0000000 --- a/config/hyperparams/bal_DT.yaml +++ /dev/null @@ -1,3 +0,0 @@ -criterion: gini -max_depth: 7 -splitter: best diff --git a/config/hyperparams/bal_KNN.yaml b/config/hyperparams/bal_KNN.yaml deleted file mode 100644 index 710b5f6..0000000 --- a/config/hyperparams/bal_KNN.yaml +++ /dev/null @@ -1,3 +0,0 @@ -algorithm: auto -n_neighbors: 1 -weights: uniform diff --git a/config/hyperparams/bal_MLP.yaml b/config/hyperparams/bal_MLP.yaml deleted file mode 100644 index a4fb4e4..0000000 --- a/config/hyperparams/bal_MLP.yaml +++ /dev/null @@ -1,8 +0,0 @@ -activation: tanh -alpha: 0.005 -hidden_layer_sizes: !!python/tuple -- 20 -- 20 -learning_rate: adaptive -max_iter: 1000 -solver: adam diff --git a/config/hyperparams/balanced_DT.yaml b/config/hyperparams/balanced_DT.yaml deleted file mode 100644 index 82d03ca..0000000 --- a/config/hyperparams/balanced_DT.yaml +++ /dev/null @@ -1,3 +0,0 @@ -criterion: gini -max_depth: 4 -splitter: best diff --git a/config/hyperparams/balanced_KNN.yaml b/config/hyperparams/balanced_KNN.yaml deleted file mode 100644 index 6b4c149..0000000 
--- a/config/hyperparams/balanced_KNN.yaml +++ /dev/null @@ -1,3 +0,0 @@ -algorithm: ball_tree -n_neighbors: 1 -weights: uniform diff --git a/config/hyperparams/balanced_MLP.yaml b/config/hyperparams/balanced_MLP.yaml deleted file mode 100644 index a4fb4e4..0000000 --- a/config/hyperparams/balanced_MLP.yaml +++ /dev/null @@ -1,8 +0,0 @@ -activation: tanh -alpha: 0.005 -hidden_layer_sizes: !!python/tuple -- 20 -- 20 -learning_rate: adaptive -max_iter: 1000 -solver: adam diff --git a/config/hyperparams/balanced_RF.yaml b/config/hyperparams/balanced_RF.yaml deleted file mode 100644 index 89df26a..0000000 --- a/config/hyperparams/balanced_RF.yaml +++ /dev/null @@ -1,6 +0,0 @@ -class_weight: balanced -criterion: entropy -max_depth: 20 -min_samples_leaf: 2 -min_samples_split: 5 -n_estimators: 50 diff --git a/config/hyperparams/balanced_SVC.yaml b/config/hyperparams/balanced_SVC.yaml deleted file mode 100644 index 505a4fd..0000000 --- a/config/hyperparams/balanced_SVC.yaml +++ /dev/null @@ -1,4 +0,0 @@ -C: 100 -gamma: auto -kernel: rbf -tol: 0.0316 diff --git a/config/hyperparams/basic_DT.yaml b/config/hyperparams/basic_DT.yaml deleted file mode 100644 index 7e09a7f..0000000 --- a/config/hyperparams/basic_DT.yaml +++ /dev/null @@ -1,3 +0,0 @@ -criterion: entropy -max_depth: 19 -splitter: random diff --git a/config/hyperparams/basic_KNN.yaml b/config/hyperparams/basic_KNN.yaml deleted file mode 100644 index d7863e4..0000000 --- a/config/hyperparams/basic_KNN.yaml +++ /dev/null @@ -1,3 +0,0 @@ -algorithm: auto -n_neighbors: 7 -weights: distance diff --git a/config/hyperparams/basic_MLP.yaml b/config/hyperparams/basic_MLP.yaml deleted file mode 100644 index b3a62b5..0000000 --- a/config/hyperparams/basic_MLP.yaml +++ /dev/null @@ -1,8 +0,0 @@ -activation: tanh -alpha: 0.05 -hidden_layer_sizes: !!python/tuple -- 20 -- 20 -learning_rate: constant -max_iter: 1000 -solver: adam diff --git a/config/hyperparams/basic_RF.yaml b/config/hyperparams/basic_RF.yaml deleted file mode 100644 index 5359ae0..0000000 --- a/config/hyperparams/basic_RF.yaml +++ /dev/null @@ -1,5 +0,0 @@ -criterion: entropy -max_depth: 8 -max_features: sqrt -n_estimators: 200 -random_state: 18 diff --git a/config/hyperparams/basic_SVC.yaml b/config/hyperparams/basic_SVC.yaml deleted file mode 100644 index 505a4fd..0000000 --- a/config/hyperparams/basic_SVC.yaml +++ /dev/null @@ -1,4 +0,0 @@ -C: 100 -gamma: auto -kernel: rbf -tol: 0.0316 diff --git a/config/hyperparams/normal_DT.yaml b/config/hyperparams/normal_DT.yaml deleted file mode 100644 index bfd1d81..0000000 --- a/config/hyperparams/normal_DT.yaml +++ /dev/null @@ -1,3 +0,0 @@ -criterion: gini -max_depth: 10 -splitter: random diff --git a/config/hyperparams/normal_KNN.yaml b/config/hyperparams/normal_KNN.yaml deleted file mode 100644 index b1680ba..0000000 --- a/config/hyperparams/normal_KNN.yaml +++ /dev/null @@ -1,3 +0,0 @@ -algorithm: auto -n_neighbors: 5 -weights: distance diff --git a/config/hyperparams/normal_MLP.yaml b/config/hyperparams/normal_MLP.yaml deleted file mode 100644 index a4fb4e4..0000000 --- a/config/hyperparams/normal_MLP.yaml +++ /dev/null @@ -1,8 +0,0 @@ -activation: tanh -alpha: 0.005 -hidden_layer_sizes: !!python/tuple -- 20 -- 20 -learning_rate: adaptive -max_iter: 1000 -solver: adam diff --git a/config/hyperparams/normal_RF.yaml b/config/hyperparams/normal_RF.yaml deleted file mode 100644 index 94d3a0f..0000000 --- a/config/hyperparams/normal_RF.yaml +++ /dev/null @@ -1,6 +0,0 @@ -class_weight: null -criterion: entropy -max_depth: null 
-min_samples_leaf: 1 -min_samples_split: 2 -n_estimators: 200 diff --git a/config/hyperparams/normal_SVC.yaml b/config/hyperparams/normal_SVC.yaml deleted file mode 100644 index 505a4fd..0000000 --- a/config/hyperparams/normal_SVC.yaml +++ /dev/null @@ -1,4 +0,0 @@ -C: 100 -gamma: auto -kernel: rbf -tol: 0.0316 diff --git a/config/ml_models.py b/config/ml_models.py index 8db0663..5a35d1b 100644 --- a/config/ml_models.py +++ b/config/ml_models.py @@ -5,38 +5,34 @@ from sklearn.neural_network import MLPClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier +from sklearn.ensemble import GradientBoostingClassifier from sklearn.svm import SVR from sklearn.ensemble import RandomForestRegressor from sklearn.neural_network import MLPRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.neighbors import KNeighborsRegressor +from sklearn.ensemble import GradientBoostingRegressor -ml_models = [ - 'KNN', - 'DT', - 'SVC', - 'RF', - 'MLP' - ] -ml_regressors = [ - 'DTR', - 'SVR', - 'RFR', - 'KNNR', - 'MLPR' - ] +classifiers = { + 'DT-Classifier': DecisionTreeClassifier, + 'KNN-Classifier': KNeighborsClassifier, + 'RF-Classifier': RandomForestClassifier, + 'SVM-Classifier': SVC, + 'MLP-Classifier': MLPClassifier, + # 'GB-Classifier': GradientBoostingClassifier +} -sklearn_models = { - 'DT': DecisionTreeClassifier, - 'KNN': KNeighborsClassifier, - 'RF': RandomForestClassifier, - 'SVC': SVC, - 'MLP': MLPClassifier, - 'DTR': DecisionTreeRegressor, - 'KNNR': KNeighborsRegressor, - 'RFR': RandomForestRegressor, - 'SVR': SVR, - 'MLPR': MLPRegressor +regressors = { + 'DT-Regressor': DecisionTreeRegressor, + 'KNN-Regressor': KNeighborsRegressor, + 'RF-Regressor': RandomForestRegressor, + 'SVM-Regressor': SVR, + 'MLP-Regressor': MLPRegressor, + # 'GB-Regressor': GradientBoostingRegressor } + +all_models = {**classifiers, **regressors} + +heuristics = []  # 'T1', 'gmods', 'brown', 'random', 'virtual-best' diff --git a/find_filename.py b/find_filename.py index feb911e..bd62c25 100644 --- a/find_filename.py +++ b/find_filename.py @@ -3,16 +3,16 @@ from config.general_values import purposes -def find_hyperparams_filename(method, ml_model): +def find_hyperparams_filename(model_name, paradigm, training_quality): return os.path.join(os.path.dirname(__file__), 'config', 'hyperparams', - f'{method}_{ml_model}') + f'{model_name}-{paradigm}-{training_quality}') -def find_model_filename(method, ml_model): +def find_model_filename(model_name, paradigm, training_quality): return os.path.join(os.path.dirname(__file__), 'config', 'models', - f'{method}_{ml_model}.txt') + f'{model_name}-{paradigm}-{training_quality}.txt') def find_dataset_filename(purpose, method=None): diff --git a/from_poly_set_to_features.py b/from_poly_set_to_features.py index 9074189..86d7482 100644 --- a/from_poly_set_to_features.py +++ b/from_poly_set_to_features.py @@ -118,8 +118,6 @@ def get_standarized_features(names, features): standarizing_values = pickle.load(standarizing_values_file) # we keep only the features that are unique standarized_features = [] - # for featurex in zip(*features): - # print(type(featurex), len(features)) index = 0 for index, feature in enumerate(zip(*features)): mean, std = standarizing_values[names[index]] diff --git a/main.py b/main.py index f5c052d..25c62af 100644 --- a/main.py +++ b/main.py @@ -30,7 +30,7 @@ # Hyperparameter tuning take a very long time, # if tune_hyperparameters is used to decide whether to tune them # or to used previously tuned 
-tune_hyperparameters = False +tune_hyperparameters = True train_the_models = True paradigm = 'classification' diff --git a/main_heuristics.py b/main_heuristics.py index dd56e27..462b341 100644 --- a/main_heuristics.py +++ b/main_heuristics.py @@ -4,117 +4,95 @@ import random # import numpy as np from Heuristics.heuristics_guess import not_greedy_heuristic_guess -from Heuristics.heuristics_guess import choose_order_given_projections +from Heuristics.heuristics_guess import ordering_given_projections from find_filename import find_dataset_filename from test_models import compute_metrics +from config.ml_models import heuristics random.seed(0) nvar = 3 testing_method = 'Normal' -test_dataset_filename = find_dataset_filename('Test', - testing_method) -with open(test_dataset_filename, 'rb') as test_dataset_file: - testing_dataset = pickle.load(test_dataset_file) -output_file = "heuristics_output_acc_time.csv" - -# TESTING GMODS IN AUUGMENTED : Features 2, 67 and 132 -def choose_gmods(features): - a = [] - # # print(features) - # a.append(features[2]) - # a.append(features[67]) - # a.append(features[132]) - if a[0]==min(a): - if a[1]<=a[2]: - return 0 - else: - return 1 - elif a[1]==min(a): - if a[0]<=a[2]: - return 2 - else: - return 3 - elif a[2]==min(a): - if a[0]<=a[1]: - return 4 - else: - return 5 +# # TESTING GMODS IN AUGMENTED : Features 2, 67 and 132 +# def choose_gmods(features): +# a = [] +# # # print(features) +# # a.append(features[2]) +# # a.append(features[67]) +# # a.append(features[132]) +# if a[0] == min(a): +# if a[1] <= a[2]: +# return 0 +# else: +# return 1 +# elif a[1] == min(a): +# if a[0] <= a[2]: +# return 2 +# else: +# return 3 +# elif a[2]==min(a): +# if a[0]<=a[1]: +# return 4 +# else: +# return 5 -# Testing in heuristics that make all the choice at once -first_heuristic = 1 -for heuristic in ['T1', 'gmods', 'brown', 'random', 'virtual-best']: -# for heuristic in ['gmods', 'virtual best']: - reps = 100 - sum_metrics = dict() - for i in range(reps): - if heuristic == 'virtual-best': - # chosen_indices = [np.argmin(timings) for timings in testing_dataset['timings']] - chosen_indices = testing_dataset['labels'] - elif heuristic == 'random': - chosen_indices = [random.randint(0, 5) for timings in testing_dataset['timings']] - else: - chosen_indices = [not_greedy_heuristic_guess(projection[0][0], heuristic) +def ordering_choices_heuristics(heuristic, testing_dataset, greedy=False): + if heuristic == 'virtual-best': + chosen_indices = testing_dataset['labels'] + elif heuristic == 'random': + chosen_indices = [random.randint(0, len(timings)-1) + for timings in testing_dataset['timings']] + else: + if greedy: + chosen_indices = [ordering_given_projections(projection, heuristic) for projection in testing_dataset['projections']] - # chosen_indices = [choose_gmods(features) - # for features in testing_dataset['features']] - metrics = compute_metrics(chosen_indices, - testing_dataset['labels'], - testing_dataset['timings'], - testing_dataset['cells'], - heuristic) - if len(sum_metrics) == 0: - sum_metrics = metrics + else: - sum_metrics = {key: metrics[key] + sum_metrics[key] for key in metrics} - aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics} - augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup'] else math.factorial(nvar)*aveg_metrics[key] for key in sum_metrics} + chosen_indices = [not_greedy_heuristic_guess(projection[0][0], + heuristic) + for projection in testing_dataset['projections']] + return chosen_indices - print(heuristic, 
augmented_metrics) - if first_heuristic == 1: - first_heuristic = 0 - keys = list(augmented_metrics.keys()) - with open(output_file, 'a') as f: - f.write('Choosing the whole ordering in the beggining \n') - f.write(', '.join(['Model'] + keys) + '\n') - with open(output_file, 'a', newline='') as f: - writer = csv.writer(f) - writer.writerow([heuristic] + [augmented_metrics[key] for key in keys]) -# # Testing on greedy heuristics -# for heuristic in ['brown', 'gmods', 'random', 'virtual best']: -# reps = 100 -# sum_metrics = dict() -# for i in range(reps): -# if heuristic == 'virtual best': -# chosen_indices = [np.argmin(timings) for timings in testing_dataset['timings']] -# elif heuristic == 'random': -# chosen_indices = [random.randint(0, 5) for timings in testing_dataset['timings']] -# else: -# chosen_indices = [choose_order_given_projections(projection, heuristic) -# for projection in testing_dataset['projections']] -# metrics = compute_metrics(chosen_indices, -# testing_dataset['labels'], -# testing_dataset['timings'], -# testing_dataset['cells'], -# heuristic) -# if len(sum_metrics) == 0: -# sum_metrics = metrics -# else: -# sum_metrics = {key: metrics[key] + sum_metrics[key] for key in metrics} -# aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics} -# augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup'] else math.factorial(nvar)*aveg_metrics[key] for key in sum_metrics} -# print(heuristic, augmented_metrics) -# if first_heuristic == 1: -# first_heuristic = 0 -# keys = list(augmented_metrics.keys()) -# with open(output_file, 'a') as f: -# f.write('Now choosing greedily \n') -# f.write(', '.join(['Model'] + keys) + '\n') -# with open(output_file, 'a', newline='') as f: -# writer = csv.writer(f) -# writer.writerow([heuristic] + [augmented_metrics[key] for key in keys]) -# # print(sum(min(timings) for timings in testing_dataset['timings'])) +if __name__ == "__main__": + test_dataset_filename = find_dataset_filename('Test', + testing_method) + with open(test_dataset_filename, 'rb') as test_dataset_file: + testing_dataset = pickle.load(test_dataset_file) + output_file = "heuristics_output_acc_time.csv" + + # Test each heuristic, both choosing greedily and choosing the whole ordering at once + first_heuristic = 1 + for greedy in [True, False]: + for heuristic in heuristics: + # for heuristic in ['gmods', 'virtual best']: + reps = 100 + for i in range(reps): + chosen_indices = ordering_choices_heuristics(heuristic, + testing_dataset, + greedy=greedy) + metrics = compute_metrics(chosen_indices, + testing_dataset) + if i == 0: + sum_metrics = metrics + else: + sum_metrics = {key: metrics[key] + sum_metrics[key] + for key in metrics} + aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics} + augmented_metrics = {key: aveg_metrics[key] + if key in ['Accuracy', 'Markup'] + else math.factorial(nvar)*aveg_metrics[key] + for key in sum_metrics} + + print('not-'*(not greedy) + 'greedy-' + heuristic, + augmented_metrics) + if first_heuristic == 1: + first_heuristic = 0 + keys = list(augmented_metrics.keys()) + with open(output_file, 'a') as f: + f.write(', '.join(['Model'] + keys) + '\n') + with open(output_file, 'a', newline='') as f: + writer = csv.writer(f) + writer.writerow(['not-'*(not greedy) + 'greedy-' + heuristic] + + [augmented_metrics[key] for key in keys]) diff --git a/main_regression.py b/main_regression.py index 316fa8c..03699c9 100644 --- a/main_regression.py +++ b/main_regression.py @@ -57,7 +57,7 @@ # 
C:\Software\Python37\Lib\site-packages\sklearn\neighbors\_regression.py print(f"Testing models trained in {ml_model}") metrics = test_model(ml_model, paradigm=paradigm, - testing_method=testing_method) + testing_method=testing_method) if first_time == 1: first_time = 0 keys = list(metrics.keys()) diff --git a/make_plots.py b/make_plots.py index 8b04352..105cd29 100644 --- a/make_plots.py +++ b/make_plots.py @@ -96,4 +96,4 @@ def create_adversarial_plot( plt.cla() -create_adversarial_plot() +# create_adversarial_plot() diff --git a/packages/dataset_manipulation/dataset_manipulation.py b/packages/dataset_manipulation/dataset_manipulation.py index 62dff14..8e685e9 100644 --- a/packages/dataset_manipulation/dataset_manipulation.py +++ b/packages/dataset_manipulation/dataset_manipulation.py @@ -28,7 +28,6 @@ def augmentate_instance(features, timings, cells, nvar): return augmented_features, augmented_timings, augmented_cells - def augmentate_dataset(all_features, all_timings, all_cells, nvar): """ Multiply the size of the dataset by math.factorial(nvar). diff --git a/replicating_Dorians_features.py b/replicating_Dorians_features.py index ae145aa..832c44b 100644 --- a/replicating_Dorians_features.py +++ b/replicating_Dorians_features.py @@ -57,8 +57,6 @@ def extract_features(dataset): all_original_polynomials = [] all_projections = [] all_cells = [] - for index, elem in enumerate(dataset): - print(index, elem[0]) for index, projections in enumerate(dataset[0]): all_projections.append(projections) original_polynomials = projections[0][0] diff --git a/test_models.py b/test_models.py index 92ff35d..ef32a4b 100644 --- a/test_models.py +++ b/test_models.py @@ -5,8 +5,10 @@ import numpy as np from sklearn import metrics from config.general_values import dataset_qualities -from config.ml_models import ml_models -from config.ml_models import ml_regressors +from config.ml_models import all_models +from config.ml_models import regressors +from config.ml_models import classifiers +from config.ml_models import heuristics from find_filename import find_output_filename from find_filename import find_dataset_filename from find_filename import find_model_filename @@ -34,7 +36,7 @@ def test_results(training_method): with open(output_filename, 'w') as output_file: writer_balanced = csv.writer(output_file) writer_balanced.writerow(["Name"] + dataset_qualities) - for ml_model in ml_models: + for ml_model in all_models: trained_model_filename = find_model_filename(training_method, ml_model) accuracy = dict() @@ -97,27 +99,15 @@ def test_regressor(ml_model): def test_model(ml_model, paradigm, testing_method='augmented'): - trained_model_filename = find_model_filename(paradigm, - ml_model) - # print(trained_model_filename, paradigm, ml_model) test_dataset_filename = find_dataset_filename('Test', testing_method) - with open(trained_model_filename, 'rb') as trained_model_file: - model = pickle.load(trained_model_file) with open(test_dataset_filename, 'rb') as test_dataset_file: testing_dataset = pickle.load(test_dataset_file) - if ml_model in ml_regressors and paradigm == 'regression': - chosen_indices = [return_regressor_choice(model, features) - for features in testing_dataset['features']] - elif ml_model in ml_models: - # print('testing_method', testing_method) - chosen_indices = [model.predict([features])[0] - for features in testing_dataset['features']] - elif paradigm == 'reinforcement' and testing_method == 'Normal': - chosen_indices = [ordering_choice_reinforcement(model, projections) - for projections in 
testing_dataset['projections']] - # print(chosen_indices) - # print("here2") + chosen_indices = choose_indices(ml_model, testing_dataset, paradigm) - return compute_metrics(chosen_indices, - testing_dataset['labels'], - testing_dataset['timings'], + return compute_metrics(chosen_indices, testing_dataset) @@ -125,7 +115,28 @@ - testing_dataset['cells'], - ml_model) -def compute_metrics(chosen_indices, labels, all_timings, all_cells, model): +def choose_indices(ml_model, dataset, paradigm=''): + if ml_model in heuristics: + # imported inside the function to avoid a circular import with main_heuristics + from main_heuristics import ordering_choices_heuristics + return ordering_choices_heuristics(ml_model, dataset) + trained_model_filename = find_model_filename(paradigm, ml_model) + with open(trained_model_filename, 'rb') as trained_model_file: + model = pickle.load(trained_model_file) + if ml_model in regressors: + chosen_indices = [return_regressor_choice(model, features) + for features in dataset['features']] + elif ml_model in classifiers: + chosen_indices = [model.predict([features])[0] + for features in dataset['features']] + elif paradigm == 'reinforcement': + chosen_indices = [ordering_choice_reinforcement(model, projections) + for projections in dataset['projections']] + return chosen_indices + + +def compute_metrics(chosen_indices, testing_dataset): + labels = testing_dataset['labels'] + all_timings = testing_dataset['timings'] + all_cells = testing_dataset['cells'] metrics = dict() correct = 0 metrics['TotalTime'] = 0 @@ -143,9 +154,6 @@ def compute_metrics(chosen_indices, labels, all_timings, all_cells, model): metrics['TotalCells'] += cells[chosen_index] chosen_times = [timings[index] for index, timings in zip(chosen_indices, all_timings)] - timings_lists_filename = find_timings_lists(model) - with open(timings_lists_filename, 'wb') as timings_lists_file: - pickle.dump(chosen_times, timings_lists_file) metrics['TotalTime'] = sum(chosen_times) total_instances = len(chosen_indices) metrics['Accuracy'] = correct/total_instances diff --git a/test_train_datasets.py b/test_train_datasets.py index 7279244..0cc51b1 100644 --- a/test_train_datasets.py +++ b/test_train_datasets.py @@ -116,11 +116,9 @@ def create_regression_datasets(taking_logarithms=True): # we will use the augmented dataset here with open(this_dataset_filename, 'rb') as this_dataset_file: regression_dataset = pickle.load(this_dataset_file) - # print("regression_dataset['timings']", len(regression_dataset['timings']), regression_dataset['timings']) regression_dataset['labels'] = \ [timings[0] for timings in regression_dataset['timings']] - # print("regression_dataset['labels']", len(regression_dataset['labels']), regression_dataset['labels']) if taking_logarithms: regression_dataset['labels'] = \ [log(label) for label @@ -133,7 +131,6 @@ # classification_dataset['labels'] = \ # [np.argmin(timings) for timings # in regression_dataset['timings']] - # print(classification_dataset['labels']) # create_regression_datasets(taking_logarithms=False) diff --git a/train_models.py b/train_models.py index bde4209..89f685a 100644 --- a/train_models.py +++ b/train_models.py @@ -2,15 +2,14 @@ import pickle import random from yaml_tools import read_yaml_from_file -from config.ml_models import sklearn_models -from config.ml_models import ml_regressors +from config.ml_models import all_models from find_filename import find_dataset_filename from find_filename import find_hyperparams_filename from 
find_filename import find_model_filename from find_filename import find_other_filename from dataset_manipulation import give_all_symmetries import numpy as np -from sklearn import metrics +# from sklearn import metrics from itertools import combinations from replicating_Dorians_features import compute_features_for_var from test_models import compute_metrics @@ -22,13 +21,14 @@ def train_model(ml_model, method): with open(train_data_filename, 'rb') as train_data_file: train_dataset = pickle.load(train_data_file) hyperparams = read_yaml_from_file(hyperparams_file) - current_model = sklearn_models[ml_model] + current_model = all_models[ml_model] model = current_model(**hyperparams) # model = current_model() model.fit(train_dataset['features'], train_dataset['labels']) trained_model_filename = find_model_filename(method, ml_model) with open(trained_model_filename, 'wb') as trained_model_file: pickle.dump(model, trained_model_file) + return model def train_regression_model(ml_model, method): @@ -75,7 +77,7 @@ def train_reinforcement_model(ml_model, method='Normal'): train_dataset = pickle.load(train_data_file) # hyperparams_file = find_hyperparams_filename(method, ml_model) # hyperparams = read_yaml_from_file(hyperparams_file) - current_model = sklearn_models[ml_model] + current_model = all_models[ml_model] # model = current_model(**hyperparams) model = current_model() first_polys = train_dataset['projections'][0][0][0] @@ -148,7 +150,10 @@ def var_choice_reinforcement(model, polynomials): chosen by the model trained using reinforcement''' vars_features = get_vars_features(polynomials) evaluations = model.predict(vars_features) - return np.argmin(evaluations) + min_value = np.min(evaluations) + min_indices = np.where(evaluations == min_value)[0] + # Randomly select one of the minimal indices + return np.random.choice(min_indices) def ordering_choice_reinforcement(model, projections):
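Note (illustration only, not part of the patch): after the rename, config.ml_models and config.hyperparameters_grid are meant to share one key scheme ('RF-Classifier', 'DT-Regressor', ...), so the registries can be cross-checked against the grids and fed straight into the GridSearchCV call that k_folds_ml wraps. A minimal sketch under that assumption; make_classification stands in for the real pickled feature datasets.

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV

from config.hyperparameters_grid import grid
from config.ml_models import all_models

# every registered model needs a grid entry, otherwise grid[model]
# inside k_folds_ml raises KeyError at tuning time
missing = [name for name in all_models if name not in grid]
assert not missing, f"models without a hyperparameter grid: {missing}"

# minimal 5-fold search under the new key scheme, mirroring k_folds_ml
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
name = 'DT-Classifier'
search = GridSearchCV(estimator=all_models[name](), param_grid=grid[name], cv=5)
search.fit(X, y)
print(name, search.best_params_)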
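The last hunk replaces np.argmin in var_choice_reinforcement with a draw among all minimisers: argmin always returns the first minimum, which systematically favours lower variable indices whenever evaluations tie. A standalone illustration with invented evaluation values:

import numpy as np

evaluations = np.array([3.0, 1.0, 1.0, 2.0])
min_indices = np.where(evaluations == np.min(evaluations))[0]  # array([1, 2])
print(np.argmin(evaluations))         # always 1: biased towards the earlier tie
print(np.random.choice(min_indices))  # 1 or 2, each with probability 1/2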
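For reference, the dispatch inside the new ordering_choices_heuristics: 'virtual-best' echoes the precomputed labels, 'random' draws one index per instance (now range-aware instead of the hard-coded 0..5), and any other name is delegated to a heuristic guess. A toy run of the first two branches, on a two-instance dataset invented for the example:

import random

toy_dataset = {
    'labels': [0, 1],                     # index of the fastest ordering per instance
    'timings': [[1.2, 3.4], [5.0, 2.5]],  # seconds per candidate ordering
}
print(toy_dataset['labels'])  # what the 'virtual-best' branch returns
print([random.randint(0, len(t) - 1) for t in toy_dataset['timings']])  # 'random' branch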