diff --git a/create_clean_dataset.py b/create_clean_dataset.py
index bacf348..2851bda 100644
--- a/create_clean_dataset.py
+++ b/create_clean_dataset.py
@@ -2,7 +2,6 @@
 the sets of polynomials and its timings for each order,
 creates a dataset containing a set of unique features and its class"""
-import os
 import pickle
 import numpy as np
 from replicating_Dorians_features import extract_features
@@ -12,6 +11,7 @@ else:
     from packages.dataset_manipulation import remove_notunique_features
 from from_poly_set_to_features import poly_set_feature_extractor
+from find_filename import find_dataset_filename
 
 
 def create_dataframe(dataset):
@@ -22,9 +22,10 @@ def create_dataframe(dataset):
     for index, all_projections in enumerate(dataset[0]):
         original_polynomials = all_projections[0][0]
         all_original_polynomials.append(original_polynomials)
-    names, all_features = poly_set_feature_extractor(all_original_polynomials,
-                                                     determine_standarization=True,
-                                                     determine_unique_features=True)
+    names, all_features =\
+        poly_set_feature_extractor(all_original_polynomials,
+                                   determine_standarization=True,
+                                   determine_unique_features=True)
     return np.array(all_original_polynomials), np.array(names),\
         np.array(all_features), np.array(all_targets), np.array(all_timings)
 
@@ -34,14 +35,17 @@ def create_dataframe(dataset):
 #                                 'dataset_without_repetition_return_ncells.txt')
 # with open(dataset_filename, 'rb') as f:
 #     dataset = pickle.load(f)
-# original_polys_list, names, features_list, targets_list, timings_list = create_dataframe(dataset)
+# original_polys_list, names, features_list, targets_list, timings_list =\
+#     create_dataframe(dataset)
 
 
-def cleaning_dataset(dataset_filename, clean_dataset_filename):
+def cleaning_dataset():
+    dataset_filename = find_dataset_filename('unclean')
+    clean_dataset_filename = find_dataset_filename('clean')
     with open(dataset_filename, 'rb') as f:
         dataset = pickle.load(f)
-    original_polys_list, names, features_list, targets_list, timings_list = extract_features(dataset)
-
+    original_polys_list, names, features_list, targets_list, timings_list =\
+        extract_features(dataset)
     # working with raw features
     features = np.array(features_list)
     unique_names, unique_features = remove_notunique_features(names, features)
@@ -54,7 +58,6 @@ def cleaning_dataset(dataset_filename, clean_dataset_filename):
                  unique_features, targets, timings), clean_dataset_file)
 
 
-
 # dataset_filename = os.path.join(os.path.dirname(__file__),
 #                                 'DatasetsBeforeProcessing',
 #                                 'dataset_without_repetition_return_ncells.txt')
diff --git a/datasets/dataset_instances.csv b/datasets/dataset_instances.csv
index ea0d247..4a221f9 100644
--- a/datasets/dataset_instances.csv
+++ b/datasets/dataset_instances.csv
@@ -1,7 +1,7 @@
 dataset,zero,one,two,three,four,five,total
 train normal dataset,326,74,105,41,163,106,815
-train balanced dataset,118,136,125,149,134,153,815
+train balanced dataset,146,120,132,150,125,142,815
 train augmented dataset,815,815,815,815,815,815,4890
 test normal dataset,80,19,30,10,39,26,204
-test balanced dataset,39,32,36,29,31,37,204
+test balanced dataset,35,42,33,39,28,27,204
 test augmented dataset,204,204,204,204,204,204,1224
diff --git a/datasets/test/augmented_test_dataset.txt b/datasets/test/augmented_test_dataset.txt
index a31e8f4..b4f1231 100644
Binary files a/datasets/test/augmented_test_dataset.txt and b/datasets/test/augmented_test_dataset.txt differ
diff --git a/datasets/test/balanced_test_dataset.txt b/datasets/test/balanced_test_dataset.txt
index f289c71..8c0b472 100644
Binary files a/datasets/test/balanced_test_dataset.txt and b/datasets/test/balanced_test_dataset.txt differ
diff --git a/datasets/train/augmented_train_dataset.txt b/datasets/train/augmented_train_dataset.txt
index f75da24..9ba5aac 100644
Binary files a/datasets/train/augmented_train_dataset.txt and b/datasets/train/augmented_train_dataset.txt differ
diff --git a/datasets/train/balanced_train_dataset.txt b/datasets/train/balanced_train_dataset.txt
index 248a130..354deec 100644
Binary files a/datasets/train/balanced_train_dataset.txt and b/datasets/train/balanced_train_dataset.txt differ
diff --git a/main.py b/main.py
index b0afcae..8149b2e 100644
--- a/main.py
+++ b/main.py
@@ -26,29 +26,33 @@
 # Hyperparameter tuning take a very long time,
 # if tune_hyperparameters is used to decide whether to tune them
 # or to used previously tuned
-tune_hyperparameters = False
-
-original_dataset_file = find_dataset_filename('unclean')
-clean_dataset_filename = find_dataset_filename('clean')
-cleaning_dataset(original_dataset_file, clean_dataset_filename)
-create_train_test_datasets()
-
-if tune_hyperparameters:
-    for ml_model in ml_models:
-        for method in dataset_types:
-            print(f"Choosing hyperparameters for {ml_model} in {method}")
-            choose_hyperparams(ml_model, method)
-for ml_model in ml_models:
-    print(f"Training {ml_model}")
-    for method in dataset_types:
-        print(f"for {method}")
-        train_model(ml_model, method)
-for training_method in dataset_types:
-    print(f"Testing models trained in {training_method}")
-    test_results(training_method)
+# tune_hyperparameters = False
+
+
+# cleaning_dataset()
+# create_train_test_datasets()
+# if tune_hyperparameters:
+#     for ml_model in ml_models:
+#         for method in dataset_types:
+#             print(f"Choosing hyperparameters for {ml_model} in {method}")
+#             choose_hyperparams(ml_model, method)
+# for ml_model in ml_models:
+#     print(f"Training {ml_model}")
+#     for method in dataset_types:
+#         print(f"for {method}")
+#         train_model(ml_model, method)
+# for training_method in dataset_types:
+#     print(f"Testing models trained in {training_method}")
+#     test_results(training_method)
+
+timings = dict()
 
 model = 'SVC'
 testing_method = 'Augmented'
 for training_method in dataset_types:
     print(f"Testing models trained in {training_method}")
-    print(timings_in_test(model, testing_method, training_method))
+    timings[training_method] = timings_in_test(model, testing_method, training_method)
+
+from make_plots import survival_plot
+
+survival_plot(timings)
\ No newline at end of file
diff --git a/packages/dataset_manipulation/dataset_manipulation.py b/packages/dataset_manipulation/dataset_manipulation.py
index 008b365..b92263e 100644
--- a/packages/dataset_manipulation/dataset_manipulation.py
+++ b/packages/dataset_manipulation/dataset_manipulation.py
@@ -3,6 +3,7 @@
 import math
 import random
 from .exploit_symmetries import give_all_symmetries
+from .exploit_symmetries import augmentate_timings
 # from sklearn.preprocessing import normalize
 
 nvar = 3
@@ -22,7 +23,8 @@ def augmentate_dataset(features, targets, timings):
     for features, target, timing in zip(features, targets, timings):
         symmetric_features += give_all_symmetries(features, int(target))
         symmetric_targets += list(range(math.factorial(nvar)))
-        symmetric_timings += list(timing)
+        symmetric_timings += augmentate_timings(timing, int(target))
+
     return np.array(symmetric_features), np.array(symmetric_targets), \
         np.array(symmetric_timings)
 
@@ -40,11 +42,11 @@ def balance_dataset(features, targets, timings):
     balanced_timings = []
     for features, target, timing in zip(features, targets, timings):
         symmetric_features = give_all_symmetries(features, int(target))
-        possible_targets = list(range(math.factorial(nvar)))
-        new_target = random.choice(possible_targets)
+        symmetric_timings = augmentate_timings(timing, int(target))
+        new_target = random.choice(list(range(math.factorial(nvar))))
         balanced_features.append(symmetric_features[new_target])
         balanced_targets.append(new_target)
-        balanced_timings.append(timing[new_target])
+        balanced_timings.append(symmetric_timings[new_target])
     return np.array(balanced_features), np.array(balanced_targets),\
         np.array(balanced_timings)
 
@@ -88,10 +90,10 @@ def get_unique_feature_names(unique_names, names, features):
     return np.transpose(unique_features)
 
 
-def remove_notunique_features(names, features):
+def remove_notunique_features(names, features, nvar=3):
     # creating some targets and timing because the function requires them
     targets = [0]*len(features)
-    timings = [[0, 0]]*len(features)
+    timings = [list(range(math.factorial(nvar)))]*len(features)
     augmented_features, _, _ = augmentate_dataset(features, targets, timings)
     # normalized_augmented_features = normalize(augmented_features)
     unique_names = name_unique_features(names, augmented_features)
diff --git a/packages/dataset_manipulation/exploit_symmetries.py b/packages/dataset_manipulation/exploit_symmetries.py
index 4f41a3c..3eea595 100644
--- a/packages/dataset_manipulation/exploit_symmetries.py
+++ b/packages/dataset_manipulation/exploit_symmetries.py
@@ -14,32 +14,35 @@
 """
 from itertools import permutations
 
-nvar = 3
-variables = list(range(nvar))
-perms = [list(elem) for elem in permutations(variables)]
+def get_perms(variables):
+    perms = [list(elem) for elem in permutations(variables)]
+    return perms
 
-def features_to_canonical_target(features, optimal_ordering):
+
+def features_to_canonical_target(features, optimal_ordering, nvar=3):
     """
     Reorder the features for the target to be '1'.
 
     This is done by reordering the features according to the optimal
    variable ordering of the set of polynomials.
     """
-    variable_orderings = perms[optimal_ordering]
+    perms = get_perms(list(range(nvar)))
+    best_variable_ordering = perms[optimal_ordering]
     nfeatures = len(features)
     split_features = [features[int(var*nfeatures/nvar):
                                int((var+1)*nfeatures/nvar)]
                       for var in range(nvar)]
-    ordered_features = [split_features[variable_orderings[i]]
+    ordered_features = [split_features[best_variable_ordering[i]]
                         for i in range(nvar)]
     return ordered_features
 
 
-def give_all_symmetries(features, optimal_ordering):
+def give_all_symmetries(features, optimal_ordering, nvar=3):
     """Reorder the features for all possible targets.
 
     Returns a list of of all symmetries, the first one
     corresponding to the optimal ordering"""
+    perms = get_perms(list(range(nvar)))
     ordered_features = features_to_canonical_target(features,
                                                     optimal_ordering)
     all_symmetries = []
@@ -51,3 +54,35 @@ def give_all_symmetries(features, optimal_ordering):
                                        for elem in lst]
         all_symmetries.append(flatten_new_order_features)
     return all_symmetries
+
+
+def augmentate_timings(timings, optimal_ordering, nvar=3):
+    """Given all the timings returns a list of all the possible reorderings
+    so that the first reordering corresponds to the optimal ordering and
+    the others follow that"""
+    perms = get_perms(list(range(nvar)))
+    best_variable_ordering = perms[optimal_ordering]
+    new_perms = get_perms(best_variable_ordering)
+    all_timings = []
+    for perm in new_perms:
+        # compute in which index this perm used to be
+        perm_index = perms.index(perm)
+        # find associated timing and append
+        all_timings.append(reorder_timings(timings, perm_index, nvar=3))
+    return all_timings
+
+
+def reorder_timings(timings, first_ordering, nvar=3):
+    """Given all the timings reorder them so that the first one
+    corresponds to first_ordering and the rest from the usual
+    permutations done from it"""
+    perms = get_perms(list(range(nvar)))
+    first_variable_ordering = perms[first_ordering]
+    new_perms = get_perms(first_variable_ordering)
+    new_timings = []
+    for perm in new_perms:
+        # compute in which index this perm used to be
+        perm_index = perms.index(perm)
+        # find associated timing and append
+        new_timings.append(timings[perm_index])
+    return new_timings
diff --git a/test_train_datasets.py b/test_train_datasets.py
index c2b2a86..04d9509 100644
--- a/test_train_datasets.py
+++ b/test_train_datasets.py
@@ -24,6 +24,9 @@ def create_train_test_datasets():
     with open(clean_dataset_filename, 'rb') as clean_dataset_file:
         _, names, features, targets, timings = pickle.load(clean_dataset_file)
     unique_names, unique_features = remove_notunique_features(names, features)
+    # features were already unique because of create_clean_dataset
+    # decide where to remove the features
+    print("create_train_test", timings)
     unique_features_filename = find_other_filename("unique_features")
     with open(unique_features_filename, 'wb') as unique_features_file:
         pickle.dump(unique_features_filename, unique_features_file)
@@ -35,6 +38,7 @@ def create_train_test_datasets():
     x['train_normal'], x['test_normal'], y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(unique_features, targets, timings, test_size=0.20, random_state=random_state)
+
     for purpose in ['train', 'test']:
         x[f'{purpose}_balanced'], y[f'{purpose}_balanced'], t[f'{purpose}_balanced'] = balance_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
         x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])