diff --git a/datasets/test/augmented_test_dataset.txt b/datasets/test/augmented_test_dataset.txt
index e0c076c..a600bea 100644
Binary files a/datasets/test/augmented_test_dataset.txt and b/datasets/test/augmented_test_dataset.txt differ
diff --git a/datasets/test/balanced_test_dataset.txt b/datasets/test/balanced_test_dataset.txt
index a0cb9d7..f7adca5 100644
Binary files a/datasets/test/balanced_test_dataset.txt and b/datasets/test/balanced_test_dataset.txt differ
diff --git a/datasets/train/augmented_train_dataset.txt b/datasets/train/augmented_train_dataset.txt
index f1bfddc..b97c71d 100644
Binary files a/datasets/train/augmented_train_dataset.txt and b/datasets/train/augmented_train_dataset.txt differ
diff --git a/datasets/train/balanced_train_dataset.txt b/datasets/train/balanced_train_dataset.txt
index 0f172bc..bdb7fc8 100644
Binary files a/datasets/train/balanced_train_dataset.txt and b/datasets/train/balanced_train_dataset.txt differ
diff --git a/find_filename.py b/find_filename.py
index ba7ec1e..2e02e8e 100644
--- a/find_filename.py
+++ b/find_filename.py
@@ -47,3 +47,10 @@ def find_output_filename(training_method):
 
 def find_other_filename(search):
     return os.path.join(os.path.dirname(__file__), 'config', f'{search}.txt')
+
+
+import pickle
+names_filename = find_other_filename('unique_names')
+with open(names_filename, 'rb') as names_f:
+    names = pickle.load(names_f)
+print(len(names), '\n', names[2], '\n', names[67], '\n', names[132])
diff --git a/from_poly_set_to_features.py b/from_poly_set_to_features.py
new file mode 100644
index 0000000..9074189
--- /dev/null
+++ b/from_poly_set_to_features.py
@@ -0,0 +1,127 @@
+"""This file will contain the functions necessary to convert
+a list of sets of polynomials to a list of their features.
+These features will be unique and standardised."""
+import numpy as np
+import pickle
+from packages.dataset_manipulation import augmentate_dataset
+from find_filename import find_other_filename
+from replicating_Dorians_features import features_from_set_of_polys
+
+
+def poly_set_feature_extractor(sets_of_polys, determine_unique_features=False,
+                               determine_standarization=False):
+    """Given a list of polynomial sets, return a list of their features."""
+    features_list = []
+    for set_of_polys in sets_of_polys:
+        names, features = features_from_set_of_polys(set_of_polys)
+        features_list.append(features)
+    if determine_unique_features:
+        # if we want to find unique feature names
+        find_unique_features(names, features_list)
+    unique_names, unique_features = get_unique_features(names, features_list)
+    if determine_standarization:
+        find_standarizing_values(unique_names, unique_features)
+    standarized_features = get_standarized_features(unique_names, unique_features)
+    return names, standarized_features
+
+
+# def features_set_of_polys(original_polynomials):
+#     instance_features = []
+#     names = []
+#     nvar = len(original_polynomials[0][0]) - 1
+#     for var in range(nvar):
+#         degrees = [[monomial[var] for monomial in poly]
+#                    for poly in original_polynomials]
+#         var_features, var_features_names = create_features(degrees,
+#                                                            variable=var)
+#         instance_features += var_features
+#         names += var_features_names
+#         sdegrees = [[sum(monomial) for monomial in poly
+#                      if monomial[var]!=0]+[0]
+#                     for poly in original_polynomials]
+#         svar_features, svar_features_names = create_features(sdegrees,
+#                                                              variable=var,
+#                                                              sv=True)
+#         instance_features += svar_features
+#         names += svar_features_names
+#     return names, instance_features
+
+
+def find_unique_features(names, features):
+    """
+    Saves the names of the unique features in the assigned file.
+
+    When two features share the same value for all the instances,
+    or they are the same after addition or multiplication,
+    one of them is not considered unique.
+    """
+    # we want to look for uniqueness after augmenting to discard
+    # some that might look equal
+    # creating labels and timings for the augmentate_dataset function
+    labels = [0]*len(features)
+    timings = [[0, 0]]*len(features)
+    augmented_features, _, _ = augmentate_dataset(features, labels, timings)
+    # now we look for the unique features
+    unique_features = []
+    unique_names = []
+    for index, feature in enumerate(zip(*augmented_features)):
+        if (any([np.array_equal(feature, ex_feature)
+                 for ex_feature in unique_features])
+                or np.std(feature) == 0):
+            # check if this feature has already been recorded
+            pass
+        elif feature.count(feature[0]) == len(feature):
+            # check if it is a constant list
+            pass
+        else:
+            # if none of the previous conditions hold
+            unique_features.append(feature)
+            unique_names.append(names[index])
+    unique_names_filename = find_other_filename('unique_names')
+    with open(unique_names_filename, 'wb') as unique_names_file:
+        pickle.dump(unique_names, unique_names_file)
+
+
+def get_unique_features(names, features):
+    """Return the features corresponding to a name in 'unique_names'."""
+    # We recover the list of unique feature names
+    unique_names_filename = find_other_filename('unique_names')
+    with open(unique_names_filename, 'rb') as unique_names_file:
+        unique_names = pickle.load(unique_names_file)
+    # we keep only the features that are unique
+    unique_features = []
+    index = 0
+    for feature in zip(*features):
+        if names[index] in unique_names:
+            unique_features.append(feature)
+        index += 1
+    return unique_names, np.transpose(unique_features)
+
+
+def find_standarizing_values(names, features_list):
+    """Finds and saves the mean and std of the different features
+    so that features can be standardised in a consistent way
+    before giving them to the machine learning models."""
+    standarizing_values = dict()
+    for name, features in zip(names, features_list):
+        standarizing_values[name] = (np.mean(features), np.std(features))
+    standarizing_values_filename = find_other_filename('standarizing_values')
+    with open(standarizing_values_filename, 'wb') as standarizing_values_file:
+        pickle.dump(standarizing_values, standarizing_values_file)
+
+
+def get_standarized_features(names, features):
+    """Returns the standardised features."""
+    # We recover the saved standardising values
+    standarizing_values_filename = find_other_filename('standarizing_values')
+    with open(standarizing_values_filename, 'rb') as standarizing_values_file:
+        standarizing_values = pickle.load(standarizing_values_file)
+    # each feature column is rescaled with its saved mean and std
+    standarized_features = []
+    # for featurex in zip(*features):
+    #     print(type(featurex), len(features))
+    index = 0
+    for index, feature in enumerate(zip(*features)):
+        mean, std = standarizing_values[names[index]]
+        standarized_features.append((feature-mean)/std)
+    return np.transpose(standarized_features)
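Presumably the intended call pattern for poly_set_feature_extractor is one pass over the training data with both flags switched on, so that the 'unique_names' and 'standarizing_values' files get written, followed by flag-free calls that reuse those pickled files. A minimal sketch under that assumption (the polynomial-set placeholders below are hypothetical; their exact format is whatever features_from_set_of_polys expects):

    from from_poly_set_to_features import poly_set_feature_extractor

    train_sets = [...]  # hypothetical: one set of polynomials per training instance
    test_sets = [...]   # hypothetical: one set of polynomials per testing instance

    # First pass: also determine and pickle the unique feature names
    # and the standardising (mean, std) values.
    names, train_features = poly_set_feature_extractor(
        train_sets, determine_unique_features=True, determine_standarization=True)

    # Later passes (e.g. on test data) reuse the saved
    # 'unique_names' and 'standarizing_values' files.
    names, test_features = poly_set_feature_extractor(test_sets)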
diff --git a/main.py b/main.py
index 8b234df..14355ed 100644
--- a/main.py
+++ b/main.py
@@ -30,7 +30,7 @@
 # Hyperparameter tuning take a very long time,
 # if tune_hyperparameters is used to decide whether to tune them
 # or to used previously tuned
-tune_hyperparameters = True
+tune_hyperparameters = False
 paradigm = 'classification'
 
 # cleaning_dataset()
@@ -41,11 +41,11 @@ for method in dataset_qualities:
             print(f"Choosing hyperparameters for {ml_model} in {method}")
             choose_hyperparams(ml_model, method)
 
-for ml_model in ml_models:
-    print(f"Training {ml_model}")
-    for method in dataset_qualities:
-        print(f"for {method}")
-        train_model(ml_model, method)
+# for ml_model in ml_models:
+#     print(f"Training {ml_model}")
+#     for method in dataset_qualities:
+#         print(f"for {method}")
+#         train_model(ml_model, method)
 training_method = 'augmented'
 testing_method = 'augmented'
 first_time = 1
print(f"Choosing hyperparameters for {ml_model} in {method}") choose_hyperparams(ml_model, method) -for ml_model in ml_models: - print(f"Training {ml_model}") - for method in dataset_qualities: - print(f"for {method}") - train_model(ml_model, method) +# for ml_model in ml_models: +# print(f"Training {ml_model}") +# for method in dataset_qualities: +# print(f"for {method}") +# train_model(ml_model, method) training_method = 'augmented' testing_method = 'augmented' first_time = 1 diff --git a/main_heuristics.py b/main_heuristics.py index 339aa9f..fc5730a 100644 --- a/main_heuristics.py +++ b/main_heuristics.py @@ -2,33 +2,61 @@ import math import pickle import random -import numpy as np -from Heuristics.heuristics_guess import not_greedy_heuristic_guess -from Heuristics.heuristics_guess import choose_order_given_projections +# import numpy as np +# from Heuristics.heuristics_guess import not_greedy_heuristic_guess +# from Heuristics.heuristics_guess import choose_order_given_projections from find_filename import find_dataset_filename from test_models import compute_metrics nvar = 3 -testing_method = 'Normal' +testing_method = 'Augmented' test_dataset_filename = find_dataset_filename('Test', testing_method) with open(test_dataset_filename, 'rb') as test_dataset_file: testing_dataset = pickle.load(test_dataset_file) output_file = "heuristics_output_acc_time.csv" + +# TESTING GMODS IN AUUGMENTED : Features 2, 67 and 132 +def choose_gmods(features): + a = [] + # print(features) + a.append(features[2]) + a.append(features[67]) + a.append(features[132]) + if a[0]==min(a): + if a[1]<=a[2]: + return 0 + else: + return 1 + elif a[1]==min(a): + if a[0]<=a[2]: + return 2 + else: + return 3 + elif a[2]==min(a): + if a[0]<=a[1]: + return 4 + else: + return 5 + # Testing in heuristics that make all the choice at once first_heuristic = 1 -for heuristic in ['gmods', 'brown', 'random', 'virtual best']: - reps = 100 +# for heuristic in ['T1', 'gmods', 'brown', 'random', 'virtual best']: +for heuristic in ['gmods', 'virtual best']: + reps = 10 sum_metrics = dict() for i in range(reps): if heuristic == 'virtual best': - chosen_indices = [np.argmin(timings) for timings in testing_dataset['timings']] + # chosen_indices = [np.argmin(timings) for timings in testing_dataset['timings']] + chosen_indices = testing_dataset['labels'] elif heuristic == 'random': chosen_indices = [random.randint(0, 5) for timings in testing_dataset['timings']] else: - chosen_indices = [not_greedy_heuristic_guess(projection[0][0], heuristic) - for projection in testing_dataset['projections']] + # chosen_indices = [not_greedy_heuristic_guess(projection[0][0], heuristic) + # for projection in testing_dataset['projections']] + chosen_indices = [choose_gmods(features) + for features in testing_dataset['features']] metrics = compute_metrics(chosen_indices, testing_dataset['labels'], testing_dataset['timings'], @@ -38,8 +66,8 @@ else: sum_metrics = {key: metrics[key] + sum_metrics[key] for key in metrics} aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics} - augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup'] else math.factorial(nvar)*aveg_metrics[key] for key in sum_metrics} - + augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup'] else math.factorial(1)*aveg_metrics[key] for key in sum_metrics} + print(heuristic, augmented_metrics) if first_heuristic == 1: first_heuristic = 0 @@ -51,37 +79,37 @@ writer = csv.writer(f) writer.writerow([heuristic] + [augmented_metrics[key] for key in 
diff --git a/main_regression.py b/main_regression.py
index 4f36752..316fa8c 100644
--- a/main_regression.py
+++ b/main_regression.py
@@ -26,13 +26,13 @@
 # Hyperparameter tuning take a very long time,
 # if tune_hyperparameters is used to decide whether to tune them
 # or to used previously tuned
-tune_hyperparameters = True
+tune_hyperparameters = False
 taking_logarithms = False
 
 for i in range(1):
     # cleaning_dataset()
     # create_train_test_datasets()
-    # create_regression_datasets(taking_logarithms=taking_logarithms)
+    create_regression_datasets(taking_logarithms=taking_logarithms)
     paradigm = "regression"
 
     if tune_hyperparameters:
@@ -62,7 +62,7 @@
             first_time = 0
             keys = list(metrics.keys())
             with open(output_file, 'a') as f:
-                f.write('No more cheating; no taking logarithms also\n')
+                f.write('After changing dataset\n')
                 f.write(', '.join(['Model'] + keys) + '\n')
             with open(output_file, 'a', newline='') as f:
                 writer = csv.writer(f)
diff --git a/packages/dataset_manipulation/dataset_manipulation.py b/packages/dataset_manipulation/dataset_manipulation.py
index 4e18fa8..72fd25a 100644
--- a/packages/dataset_manipulation/dataset_manipulation.py
+++ b/packages/dataset_manipulation/dataset_manipulation.py
@@ -4,35 +4,53 @@
 import random
 from .exploit_symmetries import give_all_symmetries
 from .exploit_symmetries import augmentate_timings
+from itertools import permutations
 # from sklearn.preprocessing import normalize
 
 nvar = 3
 
 
-def augmentate_dataset(features, targets, timings, cells):
+def augmentate_instance(features, timings, cells, nvar):
+    variables = list(range(nvar))
+    split_features = [features[i*len(features)//nvar:(i+1)*len(features)//nvar]
+                      for i in range(nvar)]
+    dict_timings = {str(perm): timing for perm, timing
+                    in zip(permutations(variables), timings)}
+    dict_cells = {str(perm): cell for perm, cell in zip(permutations(variables), cells)}
+    augmented_features, augmented_timings, augmented_cells = [], [], []
+    for perm in permutations(variables):
+        augmented_features.append([feature for i in perm
+                                   for feature in split_features[i]])
+        augmented_timings.append([dict_timings[str(double_perm)]
+                                  for double_perm in permutations(perm)])
+        augmented_cells.append([dict_cells[str(double_perm)]
+                                for double_perm in permutations(perm)])
+    return augmented_features, augmented_timings, augmented_cells
+
+
+def augmentate_dataset(all_features, all_timings, all_cells, nvar):
     """
-    Multiply the size of the dataset by 6.
+    Multiply the size of the dataset by math.factorial(nvar).
 
     Arguments:
     features: list(list(numpy.float))
    targets: list(numpy.float)
     """
-    symmetric_features = []
-    symmetric_targets = []
-    symmetric_timings = []
-    symmetric_cells = []
-    for features, target, timing, cell in \
-            zip(features, targets, timings, cells):
-        symmetric_features += give_all_symmetries(features, int(target))
-        symmetric_targets += list(range(math.factorial(nvar)))
-        symmetric_timings += augmentate_timings(timing, int(target))
-        symmetric_cells += augmentate_timings(cell, int(target))
-
-    return np.array(symmetric_features), np.array(symmetric_targets), \
-        np.array(symmetric_timings), np.array(symmetric_cells)
-
-
-def balance_dataset(features, targets, timings, cells):
+    augmented_features = []
+    augmented_timings = []
+    augmented_cells = []
+    for features, timings, cells in \
+            zip(all_features, all_timings, all_cells):
+        new_features, new_timings, new_cells = \
+            augmentate_instance(features, timings, cells, nvar)
+        augmented_features += new_features
+        augmented_timings += new_timings
+        augmented_cells += new_cells
+    return augmented_features, augmented_timings, augmented_cells
+
+
+def balance_dataset(all_features, all_timings, all_cells, nvar):
     """
     Balance the dataset so all targets are almost equally common.
@@ -41,21 +59,22 @@ def balance_dataset(features, targets, timings, cells):
     targets: list(numpy.float)
     """
     balanced_features = []
-    balanced_targets = []
     balanced_timings = []
     balanced_cells = []
-    for features, target, timing, cell in \
-            zip(features, targets, timings, cells):
-        symmetric_features = give_all_symmetries(features, int(target))
-        symmetric_timings = augmentate_timings(timing, int(target))
-        symmetric_cells = augmentate_timings(cell, int(target))
+    for features, timings, cells in \
+            zip(all_features, all_timings, all_cells):
         new_target = random.choice(list(range(math.factorial(nvar))))
-        balanced_features.append(symmetric_features[new_target])
-        balanced_targets.append(new_target)
-        balanced_timings.append(symmetric_timings[new_target])
-        balanced_cells.append(symmetric_cells[new_target])
-    return np.array(balanced_features), np.array(balanced_targets),\
-        np.array(balanced_timings), np.array(balanced_cells)
+        new_features, new_timings, new_cells = \
+            augmentate_instance(features, timings, cells, nvar)
+        balanced_features.append(new_features[new_target])
+        balanced_timings.append(new_timings[new_target])
+        balanced_cells.append(new_cells[new_target])
+    return balanced_features, balanced_timings, balanced_cells
+
+# features = [1,2,3,4,5,6]
+# timings = [10,20,30,40,50,60]
+# cells = [21,32,43,54,65,76]
+# print(balance_dataset([features], [timings], [cells], 3))
 
 
 def name_unique_features(names, features):
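For reference, expanding the commented-out check at the bottom of dataset_manipulation.py: with nvar = 3, augmentate_instance splits a 6-entry feature vector into three per-variable blocks and, for each of the 3! orderings (enumerated in itertools.permutations order), re-concatenates the blocks and re-indexes the timings and cell counts consistently. A small worked example using the function defined above:

    features = [1, 2, 3, 4, 5, 6]       # blocks [1, 2], [3, 4], [5, 6] for variables 0, 1, 2
    timings = [10, 20, 30, 40, 50, 60]  # one timing per ordering, in permutations(range(3)) order
    cells = [21, 32, 43, 54, 65, 76]

    feats, times, cls = augmentate_instance(features, timings, cells, nvar=3)
    # feats[0] == [1, 2, 3, 4, 5, 6]        ordering (0, 1, 2): unchanged
    # feats[3] == [3, 4, 5, 6, 1, 2]        ordering (1, 2, 0): blocks reordered
    # times[3] == [40, 30, 60, 50, 10, 20]  timings re-indexed for that ordering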
diff --git a/test_models.py b/test_models.py
index 55de50d..653642f 100644
--- a/test_models.py
+++ b/test_models.py
@@ -1,18 +1,20 @@
 import csv
+import math
 import pickle
 import importlib.util
 import numpy as np
 from sklearn import metrics
 from config.general_values import dataset_qualities
 from config.ml_models import ml_models
+from config.ml_models import ml_regressors
 from find_filename import find_output_filename
 from find_filename import find_dataset_filename
 from find_filename import find_model_filename
 # Check if 'dataset_manipulation' is installed
 if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
-    from exploit_symmetries import give_all_symmetries
+    from dataset_manipulation import augmentate_instance
 else:
-    from packages.dataset_manipulation.exploit_symmetries import give_all_symmetries
+    from packages.dataset_manipulation.dataset_manipulation import augmentate_instance
 
 
 # def test_model(trained_model_filename, test_dataset_filename):
@@ -104,8 +106,15 @@ def test_model(ml_model, paradigm, testing_method='augmented'):
         model = pickle.load(trained_model_file)
     with open(test_dataset_filename, 'rb') as test_dataset_file:
         testing_dataset = pickle.load(test_dataset_file)
-    chosen_indices = [return_regressor_choice(model, features)
-                      for features in testing_dataset['features']]
+    print("here")
+    if ml_model in ml_regressors:
+        chosen_indices = [return_regressor_choice(model, features)
+                          for features in testing_dataset['features']]
+    else:
+        chosen_indices = [model.predict([features])[0]
+                          for features in testing_dataset['features']]
+    print(chosen_indices)
+    print("here2")
     return compute_metrics(chosen_indices,
                            testing_dataset['labels'],
                            testing_dataset['timings'],
@@ -123,6 +132,7 @@ def compute_metrics(chosen_indices, labels, all_timings, all_cells):
             zip(chosen_indices, labels, all_timings, all_cells):
         if chosen_index == label:
             correct += 1
+        print(timings, chosen_index)
         if timings[chosen_index] not in [30, 60]:
             metrics['Completed'] += 1
             metrics['Total time'] += timings[chosen_index]
@@ -135,11 +145,17 @@ def compute_metrics(chosen_indices, labels, all_timings, all_cells):
 
 
 def return_regressor_choice(model, features):
-    features_all_symmetries = give_all_symmetries(features)
+    nvar = 3  ## Make this better
+    made_up_timings = list(range(math.factorial(nvar)))
+    made_up_cells = list(range(math.factorial(nvar)))
+    augmentated_features, _, _ = \
+        augmentate_instance(features, made_up_timings, made_up_cells, nvar)
     y_op = float('inf')
-    for index, x_features in enumerate(features_all_symmetries):
-        # print(x_features)
+    for index, x_features in enumerate(augmentated_features):
         y_pred = model.predict([x_features])
+        ########
+        # THIS IS NOT A LIST??
+        ########
         # print(y_pred)
         if y_op > y_pred:
             y_op = y_pred
diff --git a/test_train_datasets.py b/test_train_datasets.py
index 00da55a..fb3bf71 100644
--- a/test_train_datasets.py
+++ b/test_train_datasets.py
@@ -57,20 +57,24 @@ def create_train_test_datasets():
                 dataset['cells'],
                 test_size=0.20,
                 random_state=random_state)
-    keys = ['features', 'labels', 'timings', 'cells']
+    keys = ['features', 'timings', 'cells']
     for purpose in purposes:
         datasets[f'{purpose}_Balanced'] = \
             {key: elem for key, elem in
                 zip(keys, balance_dataset(
                     *[datasets[f'{purpose}_Normal'][key2]
-                      for key2 in keys]))
+                      for key2 in keys], nvar=3))  ## CHOOSE NVAR WELL
             }
+        datasets[f'{purpose}_Balanced']['labels'] = \
+            [timings.index(min(timings)) for timings in datasets[f'{purpose}_Balanced']['timings']]
         datasets[f'{purpose}_Augmented'] = \
             {key: elem for key, elem in
                 zip(keys, augmentate_dataset(
                     *[datasets[f'{purpose}_Normal'][key2]
-                      for key2 in keys]))
+                      for key2 in keys], nvar=3))
             }
+        datasets[f'{purpose}_Augmented']['labels'] = \
+            [timings.index(min(timings)) for timings in datasets[f'{purpose}_Augmented']['timings']]
     for purpose in purposes:
         for quality in dataset_qualities:
             this_dataset_filename = \
diff --git a/train_models.py b/train_models.py
index 190f1a9..7e73060 100644
--- a/train_models.py
+++ b/train_models.py
@@ -48,11 +48,6 @@ def train_regression_model(ml_model, method):
     # trained_model_filename = find_model_filename(method, ml_model, 'regression')
     # with open(trained_model_filename, 'wb') as trained_model_file:
     #     pickle.dump(reg, trained_model_file)
-    print("Real")
-    print(train_dataset['timings'][10:20])
-    print("Predicted")
-    print(reg.predict(train_dataset['features'])[10:20])
-    print(metrics.mean_squared_error(reg.predict(train_dataset['features']), train_dataset['timings']))
     return reg
 
 
@@ -130,7 +125,6 @@ def get_vars_features(polynomials):
     unique_features_filename = find_other_filename("unique_features")
     with open(unique_features_filename, 'wb') as unique_features_file:
         unique_names = pickle.load(unique_features_file)
-    print(unique_names)
     for var in range(nvar):
         var_features, var_names = \
             compute_features_for_var(polynomials, var)
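A note on the new label construction in test_train_datasets.py above: after augmenting or balancing, each instance's label is recomputed as the position of the smallest entry in its own re-indexed timings, which is what lets main_heuristics.py use testing_dataset['labels'] directly as the 'virtual best' choice. Continuing the timings from the augmentate_instance example:

    all_timings = [[10, 20, 30, 40, 50, 60], [40, 30, 60, 50, 10, 20]]
    labels = [timings.index(min(timings)) for timings in all_timings]
    # labels == [0, 4]: for each augmented instance, the index of its fastest ordering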