Now the times in the dataset are returned correctly
Tereso del Rio committed Jun 21, 2023
1 parent d42164a commit 57adcb1
Showing 10 changed files with 93 additions and 45 deletions.
21 changes: 12 additions & 9 deletions create_clean_dataset.py
@@ -2,7 +2,6 @@
the sets of polynomials and its timings for each order, creates a dataset
containing a set of unique features and its class"""

import os
import pickle
import numpy as np
from replicating_Dorians_features import extract_features
@@ -12,6 +11,7 @@
else:
from packages.dataset_manipulation import remove_notunique_features
from from_poly_set_to_features import poly_set_feature_extractor
from find_filename import find_dataset_filename


def create_dataframe(dataset):
@@ -22,9 +22,10 @@ def create_dataframe(dataset):
for index, all_projections in enumerate(dataset[0]):
original_polynomials = all_projections[0][0]
all_original_polynomials.append(original_polynomials)
names, all_features = poly_set_feature_extractor(all_original_polynomials,
determine_standarization=True,
determine_unique_features=True)
names, all_features =\
poly_set_feature_extractor(all_original_polynomials,
determine_standarization=True,
determine_unique_features=True)
return np.array(all_original_polynomials), np.array(names),\
np.array(all_features), np.array(all_targets), np.array(all_timings)

@@ -34,14 +35,17 @@ def create_dataframe(dataset):
# 'dataset_without_repetition_return_ncells.txt')
# with open(dataset_filename, 'rb') as f:
# dataset = pickle.load(f)
# original_polys_list, names, features_list, targets_list, timings_list = create_dataframe(dataset)
# original_polys_list, names, features_list, targets_list, timings_list =\
# create_dataframe(dataset)


def cleaning_dataset(dataset_filename, clean_dataset_filename):
def cleaning_dataset():
dataset_filename = find_dataset_filename('unclean')
clean_dataset_filename = find_dataset_filename('clean')
with open(dataset_filename, 'rb') as f:
dataset = pickle.load(f)
original_polys_list, names, features_list, targets_list, timings_list = extract_features(dataset)

original_polys_list, names, features_list, targets_list, timings_list =\
extract_features(dataset)
# working with raw features
features = np.array(features_list)
unique_names, unique_features = remove_notunique_features(names, features)
@@ -54,7 +58,6 @@ def cleaning_dataset(dataset_filename, clean_dataset_filename):
unique_features, targets, timings),
clean_dataset_file)


# dataset_filename = os.path.join(os.path.dirname(__file__),
# 'DatasetsBeforeProcessing',
# 'dataset_without_repetition_return_ncells.txt')
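After this change, cleaning_dataset resolves its own input and output paths through find_dataset_filename, so callers no longer pass them. A minimal usage sketch under that assumption:

# Minimal sketch of the refactored call site; paths are resolved internally
# via find_dataset_filename('unclean') and find_dataset_filename('clean').
from create_clean_dataset import cleaning_dataset

cleaning_dataset()  # reads the unclean dataset, pickles the cleaned one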
4 changes: 2 additions & 2 deletions datasets/dataset_instances.csv
@@ -1,7 +1,7 @@
dataset,zero,one,two,three,four,five,total
train normal dataset,326,74,105,41,163,106,815
train balanced dataset,118,136,125,149,134,153,815
train balanced dataset,146,120,132,150,125,142,815
train augmented dataset,815,815,815,815,815,815,4890
test normal dataset,80,19,30,10,39,26,204
test balanced dataset,39,32,36,29,31,37,204
test balanced dataset,35,42,33,39,28,27,204
test augmented dataset,204,204,204,204,204,204,1224
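The augmented counts are exactly 3! = 6 copies of each normal instance (one per variable ordering of the three variables), which is why the augmented totals are six times the normal ones. A quick arithmetic check:

import math

nvar = 3
print(815 * math.factorial(nvar))  # 4890, the augmented train total
print(204 * math.factorial(nvar))  # 1224, the augmented test total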
Binary file modified datasets/test/augmented_test_dataset.txt
Binary file modified datasets/test/balanced_test_dataset.txt
Binary file modified datasets/train/augmented_train_dataset.txt
Binary file modified datasets/train/balanced_train_dataset.txt
46 changes: 25 additions & 21 deletions main.py
@@ -26,29 +26,33 @@
# Hyperparameter tuning takes a very long time,
# so tune_hyperparameters is used to decide whether to tune them
# or to use previously tuned ones
tune_hyperparameters = False

original_dataset_file = find_dataset_filename('unclean')
clean_dataset_filename = find_dataset_filename('clean')
cleaning_dataset(original_dataset_file, clean_dataset_filename)
create_train_test_datasets()

if tune_hyperparameters:
for ml_model in ml_models:
for method in dataset_types:
print(f"Choosing hyperparameters for {ml_model} in {method}")
choose_hyperparams(ml_model, method)
for ml_model in ml_models:
print(f"Training {ml_model}")
for method in dataset_types:
print(f"for {method}")
train_model(ml_model, method)
for training_method in dataset_types:
print(f"Testing models trained in {training_method}")
test_results(training_method)
# tune_hyperparameters = False


# cleaning_dataset()
# create_train_test_datasets()

# if tune_hyperparameters:
# for ml_model in ml_models:
# for method in dataset_types:
# print(f"Choosing hyperparameters for {ml_model} in {method}")
# choose_hyperparams(ml_model, method)
# for ml_model in ml_models:
# print(f"Training {ml_model}")
# for method in dataset_types:
# print(f"for {method}")
# train_model(ml_model, method)
# for training_method in dataset_types:
# print(f"Testing models trained in {training_method}")
# test_results(training_method)

timings = dict()
model = 'SVC'
testing_method = 'Augmented'
for training_method in dataset_types:
print(f"Testing models trained in {training_method}")
print(timings_in_test(model, testing_method, training_method))
timings[training_method] = timings_in_test(model, testing_method, training_method)

from make_plots import survival_plot

survival_plot(timings)
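make_plots.survival_plot is not part of this diff. Below is a hedged sketch of the kind of plot it presumably produces, assuming each value in timings is a list of per-instance CAD times for one training method (the function and variable names are illustrative only, not the repository's implementation):

# Hedged sketch only; not the repository's survival_plot.
import numpy as np
import matplotlib.pyplot as plt

def survival_plot_sketch(timings):
    """Plot, for each training method, how many instances finish within t seconds."""
    for method, times in timings.items():
        times = np.sort(np.asarray(times, dtype=float))
        solved = np.arange(1, len(times) + 1)
        plt.step(times, solved, where='post', label=method)
    plt.xlabel('time (s)')
    plt.ylabel('instances solved')
    plt.legend()
    plt.show()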
14 changes: 8 additions & 6 deletions packages/dataset_manipulation/dataset_manipulation.py
@@ -3,6 +3,7 @@
import math
import random
from .exploit_symmetries import give_all_symmetries
from .exploit_symmetries import augmentate_timings
# from sklearn.preprocessing import normalize

nvar = 3
@@ -22,7 +23,8 @@ def augmentate_dataset(features, targets, timings):
for features, target, timing in zip(features, targets, timings):
symmetric_features += give_all_symmetries(features, int(target))
symmetric_targets += list(range(math.factorial(nvar)))
symmetric_timings += list(timing)
symmetric_timings += augmentate_timings(timing, int(target))

return np.array(symmetric_features), np.array(symmetric_targets), \
np.array(symmetric_timings)

@@ -40,11 +42,11 @@ def balance_dataset(features, targets, timings):
balanced_timings = []
for features, target, timing in zip(features, targets, timings):
symmetric_features = give_all_symmetries(features, int(target))
possible_targets = list(range(math.factorial(nvar)))
new_target = random.choice(possible_targets)
symmetric_timings = augmentate_timings(timing, int(target))
new_target = random.choice(list(range(math.factorial(nvar))))
balanced_features.append(symmetric_features[new_target])
balanced_targets.append(new_target)
balanced_timings.append(timing[new_target])
balanced_timings.append(symmetric_timings[new_target])
return np.array(balanced_features), np.array(balanced_targets),\
np.array(balanced_timings)

@@ -88,10 +90,10 @@ def get_unique_feature_names(unique_names, names, features):
return np.transpose(unique_features)


def remove_notunique_features(names, features):
def remove_notunique_features(names, features, nvar=3):
# creating some targets and timing because the function requires them
targets = [0]*len(features)
timings = [[0, 0]]*len(features)
timings = [list(range(math.factorial(nvar)))]*len(features)
augmented_features, _, _ = augmentate_dataset(features, targets, timings)
# normalized_augmented_features = normalize(augmented_features)
unique_names = name_unique_features(names, augmented_features)
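The behavioural fix in balance_dataset is that the timing kept for the randomly chosen target is now taken from the permuted list symmetric_timings, so the selected feature row and its timing describe the same variable ordering (the old timing[new_target] indexed the unpermuted timings). A small sketch of that pairing, with toy values and an import path assumed from the file layout:

# Sketch of the feature/timing alignment the fix enforces; toy values,
# and the import path is an assumption based on the repository layout.
import random
from packages.dataset_manipulation.exploit_symmetries import (
    augmentate_timings, give_all_symmetries)

features = list(range(12))                # toy vector: 4 features per variable
timings = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]  # one timing per variable ordering
target = 2                                # index of the optimal ordering

symmetric_features = give_all_symmetries(features, target)
symmetric_timings = augmentate_timings(timings, target)
new_target = random.choice(range(6))
# Both entries now refer to the same variable ordering:
balanced_feature_row = symmetric_features[new_target]
balanced_timing = symmetric_timings[new_target]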
49 changes: 42 additions & 7 deletions packages/dataset_manipulation/exploit_symmetries.py
@@ -14,32 +14,35 @@
"""
from itertools import permutations

nvar = 3
variables = list(range(nvar))
perms = [list(elem) for elem in permutations(variables)]

def get_perms(variables):
perms = [list(elem) for elem in permutations(variables)]
return perms

def features_to_canonical_target(features, optimal_ordering):

def features_to_canonical_target(features, optimal_ordering, nvar=3):
"""
Reorder the features for the target to be '1'.

This is done by reordering the features according to the optimal variable
ordering of the set of polynomials.
"""
variable_orderings = perms[optimal_ordering]
perms = get_perms(list(range(nvar)))
best_variable_ordering = perms[optimal_ordering]
nfeatures = len(features)
split_features = [features[int(var*nfeatures/nvar):
int((var+1)*nfeatures/nvar)]
for var in range(nvar)]
ordered_features = [split_features[variable_orderings[i]]
ordered_features = [split_features[best_variable_ordering[i]]
for i in range(nvar)]
return ordered_features


def give_all_symmetries(features, optimal_ordering):
def give_all_symmetries(features, optimal_ordering, nvar=3):
"""Reorder the features for all possible targets.
    Returns a list of all symmetries, the first one
corresponding to the optimal ordering"""
perms = get_perms(list(range(nvar)))
ordered_features = features_to_canonical_target(features,
optimal_ordering)
all_symmetries = []
@@ -51,3 +54,35 @@ def give_all_symmetries(features, optimal_ordering):
for elem in lst]
all_symmetries.append(flatten_new_order_features)
return all_symmetries


def augmentate_timings(timings, optimal_ordering, nvar=3):
"""Given all the timings returns a list of all the possible reorderings
so that the first reordering corresponds to the optimal ordering and
the others follow that"""
perms = get_perms(list(range(nvar)))
best_variable_ordering = perms[optimal_ordering]
new_perms = get_perms(best_variable_ordering)
all_timings = []
for perm in new_perms:
# compute in which index this perm used to be
perm_index = perms.index(perm)
# find associated timing and append
all_timings.append(reorder_timings(timings, perm_index, nvar=3))
return all_timings


def reorder_timings(timings, first_ordering, nvar=3):
"""Given all the timings reorder them so that the first one
corresponds to first_ordering and the rest from the usual
permutations done from it"""
perms = get_perms(list(range(nvar)))
first_variable_ordering = perms[first_ordering]
new_perms = get_perms(first_variable_ordering)
new_timings = []
for perm in new_perms:
# compute in which index this perm used to be
perm_index = perms.index(perm)
# find associated timing and append
new_timings.append(timings[perm_index])
return new_timings
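A concrete trace makes the reordering easier to follow. For three variables the canonical permutation list is [[0,1,2], [0,2,1], [1,0,2], [1,2,0], [2,0,1], [2,1,0]]; with optimal_ordering = 2 the best ordering is [1, 0, 2], and augmentate_timings returns the timings in the order of the permutations generated from that ordering (the import path is assumed from the file layout):

# Worked trace of augmentate_timings; import path assumed from the repository layout.
from packages.dataset_manipulation.exploit_symmetries import augmentate_timings

timings = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0]  # one timing per canonical ordering

print(augmentate_timings(timings, 2))
# [30.0, 40.0, 10.0, 20.0, 60.0, 50.0]
# The first entry (30.0) is the timing of the optimal ordering [1, 0, 2].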
4 changes: 4 additions & 0 deletions test_train_datasets.py
@@ -24,6 +24,9 @@ def create_train_test_datasets():
with open(clean_dataset_filename, 'rb') as clean_dataset_file:
_, names, features, targets, timings = pickle.load(clean_dataset_file)
unique_names, unique_features = remove_notunique_features(names, features)
# features were already unique because of create_clean_dataset
# decide where to remove the features
print("create_train_test", timings)
unique_features_filename = find_other_filename("unique_features")
with open(unique_features_filename, 'wb') as unique_features_file:
pickle.dump(unique_features_filename, unique_features_file)
@@ -35,6 +38,7 @@ def create_train_test_datasets():
x['train_normal'], x['test_normal'], y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(unique_features, targets, timings,
test_size=0.20,
random_state=random_state)

for purpose in ['train', 'test']:
x[f'{purpose}_balanced'], y[f'{purpose}_balanced'], t[f'{purpose}_balanced'] = balance_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
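The three-way split above relies on scikit-learn's train_test_split accepting any number of parallel arrays and splitting them consistently, which is what keeps features, targets and timings aligned. A self-contained illustration with toy data:

# Toy illustration of splitting parallel arrays consistently with train_test_split.
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(20).reshape(10, 2)   # toy features
y = np.arange(10)                  # toy targets
t = np.arange(10, 20)              # toy timings
X_tr, X_te, y_tr, y_te, t_tr, t_te = train_test_split(
    X, y, t, test_size=0.20, random_state=0)
print(X_te.shape, y_te.shape, t_te.shape)  # (2, 2) (2,) (2,)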
