diff --git a/choose_hyperparams.py b/choose_hyperparams.py index 8a9a54c..bf2410c 100644 --- a/choose_hyperparams.py +++ b/choose_hyperparams.py @@ -31,10 +31,11 @@ def k_folds_ml(x_train, y_train, model, random_state=0): def choose_hyperparams(ml_model, method): """Given a ml_model and a method, a file with the hyperparameters chosen by cross validation is created""" - this_dataset_file = find_dataset_filename('train', method=method) + this_dataset_file = find_dataset_filename('Train', method=method) with open(this_dataset_file, 'rb') as f: - x_train, y_train, _ = pickle.load(f) - hyperparams = k_folds_ml(x_train, y_train, model=ml_model) + dataset = pickle.load(f) + hyperparams = k_folds_ml(dataset['features'], dataset['labels'], model=ml_model) + print(hyperparams) hyperparams_filename = find_hyperparams_filename(method, ml_model) print(hyperparams_filename) write_yaml_to_file(hyperparams, hyperparams_filename) diff --git a/config/hyperparameters_grid.py b/config/hyperparameters_grid.py index abbef04..af84889 100644 --- a/config/hyperparameters_grid.py +++ b/config/hyperparameters_grid.py @@ -2,10 +2,16 @@ grid = dict() grid['RF'] = { - 'n_estimators': [200, 300, 400, 500], - 'max_features': ['sqrt', 'log2'], - 'max_depth': [4, 5, 6, 7, 8], - 'criterion': ['gini', 'entropy'] + # 'n_estimators': [200, 300, 400, 500], + # 'max_features': ['sqrt', 'log2'], + # 'max_depth': [4, 5, 6, 7, 8], + # 'criterion': ['gini', 'entropy'] + 'n_estimators': [50, 100, 200], + 'criterion': ['gini', 'entropy'], + 'max_depth': [None, 10, 20, 30], + 'min_samples_split': [2, 5, 10], + 'min_samples_leaf': [1, 2, 4], + 'class_weight': [None, 'balanced'], } grid['KNN'] = { 'n_neighbors': [1,3,5,7,12], @@ -35,9 +41,18 @@ } grid['RFR'] = { - 'criterion': ['squared_error', 'friedman_mse'], - "max_depth": [1,3,7], - "min_samples_leaf": [1,5,10], + # 'n_estimators': [200, 300, 400, 500], + # 'max_features': ['sqrt', 'log2'], + # 'max_depth': [4, 5, 6, 7, 8], + # 'criterion': ['squared_error', 'entropy'] + # # 'criterion': ['squared_error', 'friedman_mse'], + # # "max_depth": [1,3,7], + # # "min_samples_leaf": [1,5,10], + 'n_estimators': [50, 100, 200], + 'criterion': ['mse', 'mae'], + 'max_depth': [None, 10, 20, 30], + 'min_samples_split': [2, 5, 10], + 'min_samples_leaf': [1, 2, 4], } grid['KNNR'] = { 'n_neighbors': [3, 5, 10], diff --git a/config/hyperparams/augmented_RF.yaml b/config/hyperparams/augmented_RF.yaml index f7828fb..94d3a0f 100644 --- a/config/hyperparams/augmented_RF.yaml +++ b/config/hyperparams/augmented_RF.yaml @@ -1,5 +1,6 @@ +class_weight: null criterion: entropy -max_depth: 8 -max_features: sqrt -n_estimators: 500 -random_state: 18 +max_depth: null +min_samples_leaf: 1 +min_samples_split: 2 +n_estimators: 200 diff --git a/config/hyperparams/balanced_RF.yaml b/config/hyperparams/balanced_RF.yaml index 710fc36..89df26a 100644 --- a/config/hyperparams/balanced_RF.yaml +++ b/config/hyperparams/balanced_RF.yaml @@ -1,5 +1,6 @@ -criterion: gini -max_depth: 8 -max_features: sqrt -n_estimators: 500 -random_state: 18 +class_weight: balanced +criterion: entropy +max_depth: 20 +min_samples_leaf: 2 +min_samples_split: 5 +n_estimators: 50 diff --git a/config/hyperparams/normal_RF.yaml b/config/hyperparams/normal_RF.yaml index 5359ae0..94d3a0f 100644 --- a/config/hyperparams/normal_RF.yaml +++ b/config/hyperparams/normal_RF.yaml @@ -1,5 +1,6 @@ +class_weight: null criterion: entropy -max_depth: 8 -max_features: sqrt +max_depth: null +min_samples_leaf: 1 +min_samples_split: 2 n_estimators: 200 -random_state: 18 diff --git a/config/ml_models.py b/config/ml_models.py index 8db0663..3cdc1b8 100644 --- a/config/ml_models.py +++ b/config/ml_models.py @@ -13,19 +13,19 @@ from sklearn.neighbors import KNeighborsRegressor ml_models = [ - 'KNN', - 'DT', - 'SVC', + # 'KNN', + # 'DT', + # 'SVC', 'RF', - 'MLP' + # 'MLP' ] ml_regressors = [ - 'DTR', - 'SVR', + # 'DTR', + # 'SVR', 'RFR', - 'KNNR', - 'MLPR' + # 'KNNR', + # 'MLPR' ] sklearn_models = { diff --git a/datasets/clean_dataset.txt b/datasets/clean_dataset.txt index b7d2456..f9786e1 100644 Binary files a/datasets/clean_dataset.txt and b/datasets/clean_dataset.txt differ diff --git a/datasets/test/augmented_test_dataset.txt b/datasets/test/augmented_test_dataset.txt index 36c4323..e0c076c 100644 Binary files a/datasets/test/augmented_test_dataset.txt and b/datasets/test/augmented_test_dataset.txt differ diff --git a/datasets/test/balanced_test_dataset.txt b/datasets/test/balanced_test_dataset.txt index 1a6fd17..a0cb9d7 100644 Binary files a/datasets/test/balanced_test_dataset.txt and b/datasets/test/balanced_test_dataset.txt differ diff --git a/datasets/test/normal_test_dataset.txt b/datasets/test/normal_test_dataset.txt index 5b49302..9c61a27 100644 Binary files a/datasets/test/normal_test_dataset.txt and b/datasets/test/normal_test_dataset.txt differ diff --git a/datasets/train/augmented_train_dataset.txt b/datasets/train/augmented_train_dataset.txt index a52fea7..f1bfddc 100644 Binary files a/datasets/train/augmented_train_dataset.txt and b/datasets/train/augmented_train_dataset.txt differ diff --git a/datasets/train/balanced_train_dataset.txt b/datasets/train/balanced_train_dataset.txt index 28fd1ac..0f172bc 100644 Binary files a/datasets/train/balanced_train_dataset.txt and b/datasets/train/balanced_train_dataset.txt differ diff --git a/datasets/train/normal_train_dataset.txt b/datasets/train/normal_train_dataset.txt index 347a778..795a448 100644 Binary files a/datasets/train/normal_train_dataset.txt and b/datasets/train/normal_train_dataset.txt differ diff --git a/find_filename.py b/find_filename.py index 572e03a..ba7ec1e 100644 --- a/find_filename.py +++ b/find_filename.py @@ -20,9 +20,9 @@ def find_dataset_filename(purpose, method=None): return os.path.join(os.path.dirname(__file__), 'DatasetsBeforeProcessing', 'dataset_without_repetition_return_ncells.txt') - # 'dataset_with_repetition_return_ncells.txt') - # for returning "repeated" instances - # those with the same number of cells for all projections + # 'dataset_with_repetition_return_ncells.txt') + # for returning "repeated" instances + # those with the same number of cells for all projections elif purpose == "clean": return os.path.join(os.path.dirname(__file__), 'datasets', diff --git a/main.py b/main.py index 8f71563..8b234df 100644 --- a/main.py +++ b/main.py @@ -30,17 +30,17 @@ # Hyperparameter tuning take a very long time, # if tune_hyperparameters is used to decide whether to tune them # or to used previously tuned -tune_hyperparameters = False +tune_hyperparameters = True paradigm = 'classification' -cleaning_dataset() -create_train_test_datasets() +# cleaning_dataset() +# create_train_test_datasets() -# if tune_hyperparameters: -# for ml_model in ml_models: -# for method in dataset_qualities: -# print(f"Choosing hyperparameters for {ml_model} in {method}") -# choose_hyperparams(ml_model, method) +if tune_hyperparameters: + for ml_model in ml_models: + for method in dataset_qualities: + print(f"Choosing hyperparameters for {ml_model} in {method}") + choose_hyperparams(ml_model, method) for ml_model in ml_models: print(f"Training {ml_model}") for method in dataset_qualities: @@ -59,7 +59,7 @@ first_time = 0 keys = list(metrics.keys()) with open(output_file, 'a') as f: - f.write('Now really NO more cheating\n') + f.write('No hyperparameters\n') f.write(', '.join(['Model'] + keys) + '\n') with open(output_file, 'a', newline='') as f: writer = csv.writer(f) diff --git a/main_regression.py b/main_regression.py index fa30ab4..4f36752 100644 --- a/main_regression.py +++ b/main_regression.py @@ -26,43 +26,44 @@ # Hyperparameter tuning take a very long time, # if tune_hyperparameters is used to decide whether to tune them # or to used previously tuned -tune_hyperparameters = False +tune_hyperparameters = True taking_logarithms = False +for i in range(1): + # cleaning_dataset() + # create_train_test_datasets() + # create_regression_datasets(taking_logarithms=taking_logarithms) -# cleaning_dataset() -# create_train_test_datasets() -create_regression_datasets(taking_logarithms=taking_logarithms) + paradigm = "regression" + if tune_hyperparameters: + for ml_model in ml_regressors: + print(f"Choosing hyperparameters for {ml_model} in {paradigm}") + choose_hyperparams(ml_model, paradigm) -paradigm = "regression" -# if tune_hyperparameters: -# for ml_model in ml_regressors: -# print(f"Choosing hyperparameters for {ml_model} in {paradigm}") -# choose_hyperparams(ml_model, paradigm) -for ml_model in ml_regressors: - print(f"Training {ml_model}") - print(f"for {paradigm}") - train_model(ml_model, paradigm) -testing_method = 'augmented' -output_file = "regression_output_acc_time.csv" -# with open(output_file, 'a') as f: -# f.write("Now without logarithms and without aveg_not_zero\n") + for ml_model in ml_regressors: + print(f"Training {ml_model}") + print(f"for {paradigm}") + train_model(ml_model, paradigm) + testing_method = 'augmented' + output_file = "regression_output_acc_time.csv" + # with open(output_file, 'a') as f: + # f.write("Now without logarithms and without aveg_not_zero\n") -first_time = 1 -for ml_model in ml_regressors: - ### - # For KNNR running properly X.shape[0] has been changed to len(X) - # in line 240 of - # C:\Software\Python37\Lib\site-packages\sklearn\neighbors\_regression.py - print(f"Testing models trained in {ml_model}") - metrics = test_model(ml_model, paradigm=paradigm, - testing_method=testing_method) - if first_time == 1: - first_time = 0 - keys = list(metrics.keys()) - with open(output_file, 'a') as f: - f.write('No more cheating; no taking logarithms also\n') - f.write(', '.join(['Model'] + keys) + '\n') - with open(output_file, 'a', newline='') as f: - writer = csv.writer(f) - writer.writerow([ml_model] + [metrics[key] for key in keys]) + first_time = 1 + for ml_model in ml_regressors: + ### + # For KNNR running properly X.shape[0] has been changed to len(X) + # in line 240 of + # C:\Software\Python37\Lib\site-packages\sklearn\neighbors\_regression.py + print(f"Testing models trained in {ml_model}") + metrics = test_model(ml_model, paradigm=paradigm, + testing_method=testing_method) + if first_time == 1: + first_time = 0 + keys = list(metrics.keys()) + with open(output_file, 'a') as f: + f.write('No more cheating; no taking logarithms also\n') + f.write(', '.join(['Model'] + keys) + '\n') + with open(output_file, 'a', newline='') as f: + writer = csv.writer(f) + writer.writerow([ml_model] + [metrics[key] for key in keys]) diff --git a/replicating_Dorians_features.py b/replicating_Dorians_features.py index a65bccf..366af7b 100644 --- a/replicating_Dorians_features.py +++ b/replicating_Dorians_features.py @@ -36,16 +36,13 @@ def sign(input): def create_features(degrees, variable=0, sv=False, - include_aveg_not_zero=False): - if include_aveg_not_zero: - functions = [sum, max, aveg, aveg_not_zero] - else: - functions = [sum, max, aveg] # , aveg_not_zero] + operations=[sum, max, aveg, aveg_not_zero]): sign_or_not = [identity, sign] features = [] features_names = [] - for choice in itertools.product(functions, - sign_or_not, functions, + for choice in itertools.product(operations, + sign_or_not, + operations, sign_or_not): feature_description = (choice[0].__name__ + "sign" * (choice[1].__name__ == "sign") @@ -78,8 +75,10 @@ def extract_features(dataset): all_labels.append(dataset[1][index]) all_timings.append(dataset[2][index]) all_cells.append(dataset[3][index]) - names, instance_features = features_from_set_of_polys( - original_polynomials) + names, instance_features = \ + features_from_set_of_polys( + original_polynomials, + operations=[sum, max, aveg, aveg_not_zero]) all_features.append(instance_features) my_dataset['polynomials'] = np.array(all_original_polynomials) my_dataset['names'] = np.array(names) @@ -91,23 +90,37 @@ def extract_features(dataset): return my_dataset -def features_from_set_of_polys(original_polynomials): +def features_from_set_of_polys(original_polynomials, + operations=[sum, max, aveg, aveg_not_zero]): instance_features = [] names = [] nvar = len(original_polynomials[0][0]) - 1 for var in range(nvar): - degrees = [[monomial[var] for monomial in poly] - for poly in original_polynomials] - var_features, var_features_names = create_features(degrees, - variable=var) + var_features, var_names = \ + compute_features_for_var(original_polynomials, + var, + operations=operations) instance_features += var_features - names += var_features_names - sdegrees = \ - [[sum(monomial) for monomial in poly if monomial[var] != 0] + [0] - for poly in original_polynomials] - svar_features, svar_features_names = create_features(sdegrees, - variable=var, - sv=True) - instance_features += svar_features - names += svar_features_names + names += var_names return names, instance_features + + +def compute_features_for_var(original_polynomials, var, operations): + '''Given polynomials and a variable computes the features''' + degrees = [[monomial[var] for monomial in poly] + for poly in original_polynomials] + var_features, var_features_names = \ + create_features(degrees, + variable=var, + operations=operations) + sdegrees = \ + [[sum(monomial) for monomial in poly if monomial[var] != 0] + [0] + for poly in original_polynomials] + svar_features, svar_features_names = \ + create_features(sdegrees, + variable=var, + sv=True, + operations=operations) + var_names = var_features_names + svar_features_names + var_features = var_features + svar_features + return var_features, var_names diff --git a/test_train_datasets.py b/test_train_datasets.py index 767fbc1..00da55a 100644 --- a/test_train_datasets.py +++ b/test_train_datasets.py @@ -1,3 +1,4 @@ +import numpy as np import pickle import csv import importlib.util @@ -110,9 +111,11 @@ def create_regression_datasets(taking_logarithms=True): # we will use the augmented dataset here with open(this_dataset_filename, 'rb') as this_dataset_file: regression_dataset = pickle.load(this_dataset_file) + # print("regression_dataset['timings']", len(regression_dataset['timings']), regression_dataset['timings']) regression_dataset['labels'] = \ [timings[0] for timings in regression_dataset['timings']] + # print("regression_dataset['labels']", len(regression_dataset['labels']), regression_dataset['labels']) if taking_logarithms: regression_dataset['labels'] = \ [log(label) for label @@ -121,3 +124,11 @@ def create_regression_datasets(taking_logarithms=True): find_dataset_filename(purpose, method='regression') with open(this_dataset_filename, 'wb') as this_dataset_file: pickle.dump(regression_dataset, this_dataset_file) + # classification_dataset = regression_dataset + # classification_dataset['labels'] = \ + # [np.argmin(timings) for timings + # in regression_dataset['timings']] + # print(classification_dataset['labels']) + + +create_regression_datasets(taking_logarithms=False) \ No newline at end of file diff --git a/train_models.py b/train_models.py index a344e39..190f1a9 100644 --- a/train_models.py +++ b/train_models.py @@ -1,3 +1,4 @@ +import math import pickle from yaml_tools import read_yaml_from_file from config.ml_models import sklearn_models @@ -5,9 +6,12 @@ from find_filename import find_dataset_filename from find_filename import find_hyperparams_filename from find_filename import find_model_filename +from find_filename import find_other_filename from dataset_manipulation import give_all_symmetries import numpy as np from sklearn import metrics +from itertools import combinations +from replicating_Dorians_features import compute_features_for_var def train_model(ml_model, method): @@ -16,13 +20,13 @@ def train_model(ml_model, method): with open(train_data_filename, 'rb') as train_data_file: train_dataset = pickle.load(train_data_file) hyperparams = read_yaml_from_file(hyperparams_file) - current_classifier = sklearn_models[ml_model] - clf = current_classifier(**hyperparams) - print("DATaset", train_dataset.keys()) - clf.fit(train_dataset['features'], train_dataset['labels']) + current_model = sklearn_models[ml_model] + model = current_model(**hyperparams) + # model = current_model() + model.fit(train_dataset['features'], train_dataset['labels']) trained_model_filename = find_model_filename(method, ml_model) with open(trained_model_filename, 'wb') as trained_model_file: - pickle.dump(clf, trained_model_file) + pickle.dump(model, trained_model_file) def train_regression_model(ml_model, method): @@ -38,10 +42,8 @@ def train_regression_model(ml_model, method): #### # IS THIS REALLY DOING SOMTHING? # What if we used twice timelimit instead - current_classifier = ml_regressors[ml_model] - # print(train_dataset['timings']) - print("her") - reg = current_classifier() # **hyperparams) + current_model = ml_regressors[ml_model] + reg = current_model() # **hyperparams) reg.fit(train_dataset['features'], train_dataset['timings']) # trained_model_filename = find_model_filename(method, ml_model, 'regression') # with open(trained_model_filename, 'wb') as trained_model_file: @@ -75,3 +77,73 @@ def test_regression_model(method, regressor): # regressor = train_regression_model(ml_reg, 'balanced') # print(ml_reg) # test_regression_model('balanced', regressor) + +def train_reinforcement_model(ml_model, method='Augmented'): + train_data_filename = find_dataset_filename('Train', method=method) + with open(train_data_filename, 'rb') as train_data_file: + train_dataset = pickle.load(train_data_file) + hyperparams_file = find_hyperparams_filename(method, ml_model) + hyperparams = read_yaml_from_file(hyperparams_file) + current_model = sklearn_models[ml_model] + model = current_model(**hyperparams) + for projections, timings \ + in zip(train_dataset['projections'], train_dataset['timings']): + training_features, training_labels = \ + training_instances_reinforcement(model, projections) + model.fit(training_features, training_labels) + + + +def training_instances_reinforcement(model, projections, timings): + original_polynomials = projections[0][0] + nvar = len(original_polynomials[0][0]) - 1 + vars_features = get_vars_features(original_polynomials) + evaluations = [model.predict([var_features])[0] + for var_features in vars_features] + timing = [] + for var in range(nvar): + # retruns the polynomials after projection wrt var + projected_polynomials = projections[var * math.factorial(nvar-1)][1] + new_var = var_choice_reinforcement(model, projected_polynomials) + ordering_chosen = new_var + var * math.factorial(nvar-1) + timing[var] = timings[ordering_chosen] + # now compute which part of the difference between + # evaluations[i]/evaluations[j] and timing[i]/timing[j] + # corresponds to each evaluation + instances_features = [] + instances_labels = [] + pairs = list(combinations(range(nvar), 2)) + for i, j in pairs: + correction_coefficient = \ + math.sqrt((timing[j]/timing[j])/(evaluations[i]/evaluations[j])) + instances_features += [vars_features[i], vars_features[j]] + instances_labels += [evaluations[i]*correction_coefficient, + evaluations[j]/correction_coefficient] + return instances_features, instances_labels + + +def get_vars_features(polynomials): + '''Will return the features of each variable + in the given set of polynomials''' + vars_features = [] + nvar = len(polynomials[0][0]) - 1 + unique_features_filename = find_other_filename("unique_features") + with open(unique_features_filename, 'wb') as unique_features_file: + unique_names = pickle.load(unique_features_file) + print(unique_names) + for var in range(nvar): + var_features, var_names = \ + compute_features_for_var(polynomials, var) + var_features = [feature for feature, name + in zip(var_features, var_names) + if name in unique_names] + vars_features += var_features + return vars_features + + +def var_choice_reinforcement(model, polynomials): + '''This function will return the next variable to project chosen by the model trained using reinforcement''' + vars_features = get_vars_features(model, polynomials) + evaluations = [model.predict([var_features])[0] + for var_features in vars_features] + return evaluations.index(min(evaluations))