Bug in augmenting dataset detected, results looking amazing
Tereso del Rio committed Sep 16, 2023
1 parent 7a8a578 commit 427c13d
Showing 13 changed files with 294 additions and 99 deletions.
Binary file modified datasets/test/augmented_test_dataset.txt
Binary file not shown.
Binary file modified datasets/test/balanced_test_dataset.txt
Binary file not shown.
Binary file modified datasets/train/augmented_train_dataset.txt
Binary file not shown.
Binary file modified datasets/train/balanced_train_dataset.txt
Binary file not shown.
7 changes: 7 additions & 0 deletions find_filename.py
@@ -47,3 +47,10 @@ def find_output_filename(training_method):
def find_other_filename(search):
return os.path.join(os.path.dirname(__file__), 'config',
f'{search}.txt')


import pickle
names_filename = find_other_filename('unique_names')
with open(names_filename, 'rb') as names_f:
names = pickle.load(names_f)
print(len(names), '\n', names[2], '\n', names[67], '\n', names[132])
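This debug block runs at import time, so any module importing find_filename would execute it. A minimal sketch of a safer variant, assuming the block is intentional scaffolding for the gmods test below, wraps it in a main guard:

if __name__ == '__main__':
    import pickle
    names_filename = find_other_filename('unique_names')
    with open(names_filename, 'rb') as names_f:
        names = pickle.load(names_f)
    # indices 2, 67 and 132 are the feature positions used by
    # choose_gmods in main_heuristics.py below
    print(len(names), '\n', names[2], '\n', names[67], '\n', names[132])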
127 changes: 127 additions & 0 deletions from_poly_set_to_features.py
@@ -0,0 +1,127 @@
"""This file will contain the functions necessary to convert
a list of sets of polynomials to a list of their features.
This features will be unique and standarised"""
import math
import numpy as np
import pickle
from packages.dataset_manipulation import augmentate_dataset
from find_filename import find_other_filename
from replicating_Dorians_features import features_from_set_of_polys


def poly_set_feature_extractor(sets_of_polys, determine_unique_features=False,
                               determine_standarization=False):
    """Given a list of polynomial sets, return the list of their features"""
    features_list = []
    for set_of_polys in sets_of_polys:
        names, features = features_from_set_of_polys(set_of_polys)
        features_list.append(features)
    if determine_unique_features:
        # if we want to (re)compute and save the unique feature names
        find_unique_features(names, features_list)
    unique_names, unique_features = get_unique_features(names, features_list)
    if determine_standarization:
        find_standarizing_values(unique_names, unique_features)
    # standardisation always reuses the values saved by the call above
    standarized_features = get_standarized_features(unique_names,
                                                    unique_features)
    # return the unique names so they match the returned features
    return unique_names, standarized_features


# def features_set_of_polys(original_polynomials):
# instance_features = []
# names = []
# nvar = len(original_polynomials[0][0]) - 1
# for var in range(nvar):
# degrees = [[monomial[var] for monomial in poly]
# for poly in original_polynomials]
# var_features, var_features_names = create_features(degrees,
# variable=var)
# instance_features += var_features
# names += var_features_names
# sdegrees = [[sum(monomial) for monomial in poly
# if monomial[var]!=0]+[0]
# for poly in original_polynomials]
# svar_features, svar_features_names = create_features(sdegrees,
# variable=var,
# sv=True)
# instance_features += svar_features
# names += svar_features_names
# return names, instance_features


def find_unique_features(names, features):
"""
    Saves the names of the unique features in the assigned file.

    When two features share the same value for all the instances,
    or they coincide after addition or multiplication,
    only one of them is considered unique.
"""
    # we look for uniqueness after augmenting, so that features that
    # only coincide after augmentation are also discarded
    # creating dummy timings and cells so that augmentate_dataset
    # (which now takes all_features, all_timings, all_cells, nvar) can be called
    nvar = 3  # assumed fixed here, as elsewhere in this commit
    dummy_timings = [[0] * math.factorial(nvar)] * len(features)
    dummy_cells = [[0] * math.factorial(nvar)] * len(features)
    augmented_features, _, _ = augmentate_dataset(features, dummy_timings,
                                                  dummy_cells, nvar)
# now we look for the unique features
unique_features = []
unique_names = []
for index, feature in enumerate(zip(*augmented_features)):
        if (any(np.array_equal(feature, ex_feature)
                for ex_feature in unique_features)
                or np.std(feature) == 0):
            # this feature has already been recorded, or it is constant
            pass
        elif feature.count(feature[0]) == len(feature):
            # check if it is a constant list
            pass
        else:
            # if none of the previous conditions hold, the feature is unique
unique_features.append(feature)
unique_names.append(names[index])
unique_names_filename = find_other_filename('unique_names')
with open(unique_names_filename, 'wb') as unique_names_file:
pickle.dump(unique_names, unique_names_file)


def get_unique_features(names, features):
"""Return the features corresponding to a name in 'unique_names'."""
# We recover the list of unique feature names
unique_names_filename = find_other_filename('unique_names')
with open(unique_names_filename, 'rb') as unique_names_file:
unique_names = pickle.load(unique_names_file)
# we keep only the features that are unique
unique_features = []
    for index, feature in enumerate(zip(*features)):
        if names[index] in unique_names:
            unique_features.append(feature)
return unique_names, np.transpose(unique_features)


def find_standarizing_values(names, features_list):
    """Finds and saves the mean and std of the different features
    so that features can be standardised in a consistent way
    before giving them to the machine learning models"""
    standarizing_values = dict()
    # features_list is instances x features, so we iterate over its
    # columns, one per feature name
    for name, features in zip(names, zip(*features_list)):
        standarizing_values[name] = (np.mean(features), np.std(features))
standarizing_values_filename = find_other_filename('standarizing_values')
with open(standarizing_values_filename, 'wb') as standarizing_values_file:
pickle.dump(standarizing_values, standarizing_values_file)


def get_standarized_features(names, features):
    """Returns the standardised features."""
    # We recover the standardising values (mean and std) of each feature
    standarizing_values_filename = find_other_filename('standarizing_values')
    with open(standarizing_values_filename, 'rb') as standarizing_values_file:
        standarizing_values = pickle.load(standarizing_values_file)
    # each feature is standardised using its saved mean and std
    standarized_features = []
    for index, feature in enumerate(zip(*features)):
        mean, std = standarizing_values[names[index]]
        standarized_features.append((np.array(feature) - mean) / std)
    return np.transpose(standarized_features)
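A minimal usage sketch of the new extractor. The toy input is hypothetical, assuming each monomial is a list of one exponent per variable plus one extra entry, as the commented-out code above suggests (nvar = len(original_polynomials[0][0]) - 1):

from from_poly_set_to_features import poly_set_feature_extractor

# hypothetical toy input: one set of two polynomials in three variables
sets_of_polys = [[[[1, 0, 0, 2], [0, 1, 1, 1]],
                  [[2, 0, 0, 1], [0, 0, 1, 3]]]]
# first call: compute and save the unique feature names and their mean/std
names, features = poly_set_feature_extractor(sets_of_polys,
                                             determine_unique_features=True,
                                             determine_standarization=True)
# later calls reuse the saved values, so training and testing data
# are standardised consistently
names, features = poly_set_feature_extractor(sets_of_polys)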
12 changes: 6 additions & 6 deletions main.py
@@ -30,7 +30,7 @@
# Hyperparameter tuning takes a very long time;
# tune_hyperparameters decides whether to tune them
# or to use previously tuned values
tune_hyperparameters = True
tune_hyperparameters = False
paradigm = 'classification'

# cleaning_dataset()
@@ -41,11 +41,11 @@
for method in dataset_qualities:
print(f"Choosing hyperparameters for {ml_model} in {method}")
choose_hyperparams(ml_model, method)
for ml_model in ml_models:
print(f"Training {ml_model}")
for method in dataset_qualities:
print(f"for {method}")
train_model(ml_model, method)
# for ml_model in ml_models:
# print(f"Training {ml_model}")
# for method in dataset_qualities:
# print(f"for {method}")
# train_model(ml_model, method)
training_method = 'augmented'
testing_method = 'augmented'
first_time = 1
116 changes: 72 additions & 44 deletions main_heuristics.py
@@ -2,33 +2,61 @@
import math
import pickle
import random
import numpy as np
from Heuristics.heuristics_guess import not_greedy_heuristic_guess
from Heuristics.heuristics_guess import choose_order_given_projections
# import numpy as np
# from Heuristics.heuristics_guess import not_greedy_heuristic_guess
# from Heuristics.heuristics_guess import choose_order_given_projections
from find_filename import find_dataset_filename
from test_models import compute_metrics

nvar = 3
testing_method = 'Normal'
testing_method = 'Augmented'
test_dataset_filename = find_dataset_filename('Test',
testing_method)
with open(test_dataset_filename, 'rb') as test_dataset_file:
testing_dataset = pickle.load(test_dataset_file)
output_file = "heuristics_output_acc_time.csv"


# TESTING GMODS IN AUGMENTED: features 2, 67 and 132
def choose_gmods(features):
    a = []
    # print(features)
    a.append(features[2])
    a.append(features[67])
    a.append(features[132])
    if a[0] == min(a):
        if a[1] <= a[2]:
            return 0
        else:
            return 1
    elif a[1] == min(a):
        if a[0] <= a[2]:
            return 2
        else:
            return 3
    else:
        if a[0] <= a[1]:
            return 4
        else:
            return 5

# Testing heuristics that make the whole choice at once
first_heuristic = 1
for heuristic in ['gmods', 'brown', 'random', 'virtual best']:
reps = 100
# for heuristic in ['T1', 'gmods', 'brown', 'random', 'virtual best']:
for heuristic in ['gmods', 'virtual best']:
reps = 10
sum_metrics = dict()
for i in range(reps):
if heuristic == 'virtual best':
chosen_indices = [np.argmin(timings) for timings in testing_dataset['timings']]
# chosen_indices = [np.argmin(timings) for timings in testing_dataset['timings']]
chosen_indices = testing_dataset['labels']
elif heuristic == 'random':
chosen_indices = [random.randint(0, 5) for timings in testing_dataset['timings']]
else:
chosen_indices = [not_greedy_heuristic_guess(projection[0][0], heuristic)
for projection in testing_dataset['projections']]
# chosen_indices = [not_greedy_heuristic_guess(projection[0][0], heuristic)
# for projection in testing_dataset['projections']]
chosen_indices = [choose_gmods(features)
for features in testing_dataset['features']]
metrics = compute_metrics(chosen_indices,
testing_dataset['labels'],
testing_dataset['timings'],
@@ -38,8 +66,8 @@
else:
sum_metrics = {key: metrics[key] + sum_metrics[key] for key in metrics}
aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics}
augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup'] else math.factorial(nvar)*aveg_metrics[key] for key in sum_metrics}
augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup'] else math.factorial(1)*aveg_metrics[key] for key in sum_metrics}

print(heuristic, augmented_metrics)
if first_heuristic == 1:
first_heuristic = 0
@@ -51,37 +79,37 @@
writer = csv.writer(f)
writer.writerow([heuristic] + [augmented_metrics[key] for key in keys])

# Testing on greedy heuristics
for heuristic in ['brown', 'gmods', 'random', 'virtual best']:
reps = 100
sum_metrics = dict()
for i in range(reps):
if heuristic == 'virtual best':
chosen_indices = [np.argmin(timings) for timings in testing_dataset['timings']]
elif heuristic == 'random':
chosen_indices = [random.randint(0, 5) for timings in testing_dataset['timings']]
else:
chosen_indices = [choose_order_given_projections(projection, heuristic)
for projection in testing_dataset['projections']]
metrics = compute_metrics(chosen_indices,
testing_dataset['labels'],
testing_dataset['timings'],
testing_dataset['cells'])
if len(sum_metrics) == 0:
sum_metrics = metrics
else:
sum_metrics = {key: metrics[key] + sum_metrics[key] for key in metrics}
aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics}
augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup'] else math.factorial(nvar)*aveg_metrics[key] for key in sum_metrics}
# # Testing on greedy heuristics
# for heuristic in ['brown', 'gmods', 'random', 'virtual best']:
# reps = 100
# sum_metrics = dict()
# for i in range(reps):
# if heuristic == 'virtual best':
# chosen_indices = [np.argmin(timings) for timings in testing_dataset['timings']]
# elif heuristic == 'random':
# chosen_indices = [random.randint(0, 5) for timings in testing_dataset['timings']]
# else:
# chosen_indices = [choose_order_given_projections(projection, heuristic)
# for projection in testing_dataset['projections']]
# metrics = compute_metrics(chosen_indices,
# testing_dataset['labels'],
# testing_dataset['timings'],
# testing_dataset['cells'])
# if len(sum_metrics) == 0:
# sum_metrics = metrics
# else:
# sum_metrics = {key: metrics[key] + sum_metrics[key] for key in metrics}
# aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics}
# augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup'] else math.factorial(nvar)*aveg_metrics[key] for key in sum_metrics}

print(heuristic, augmented_metrics)
if first_heuristic == 1:
first_heuristic = 0
keys = list(augmented_metrics.keys())
with open(output_file, 'a') as f:
f.write('Now choosing greedily \n')
f.write(', '.join(['Model'] + keys) + '\n')
with open(output_file, 'a', newline='') as f:
writer = csv.writer(f)
writer.writerow([heuristic] + [augmented_metrics[key] for key in keys])
# print(sum(min(timings) for timings in testing_dataset['timings']))
# print(heuristic, augmented_metrics)
# if first_heuristic == 1:
# first_heuristic = 0
# keys = list(augmented_metrics.keys())
# with open(output_file, 'a') as f:
# f.write('Now choosing greedily \n')
# f.write(', '.join(['Model'] + keys) + '\n')
# with open(output_file, 'a', newline='') as f:
# writer = csv.writer(f)
# writer.writerow([heuristic] + [augmented_metrics[key] for key in keys])
# # print(sum(min(timings) for timings in testing_dataset['timings']))
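A quick worked example of choose_gmods, assuming features 2, 67 and 132 hold the gmods measure of variables 0, 1 and 2 respectively, and that the six return values index the orderings in the same sequence as itertools.permutations((0, 1, 2)):

# permutations((0, 1, 2)): (0,1,2), (0,2,1), (1,0,2), (1,2,0), (2,0,1), (2,1,0)
features = [0.0] * 133
features[2], features[67], features[132] = 5, 3, 7  # measures of vars 0, 1, 2
# variable 1 has the smallest measure and variable 0 beats variable 2,
# so the chosen ordering is (1, 0, 2), at index 2
print(choose_gmods(features))  # -> 2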
6 changes: 3 additions & 3 deletions main_regression.py
@@ -26,13 +26,13 @@
# Hyperparameter tuning takes a very long time;
# tune_hyperparameters decides whether to tune them
# or to use previously tuned values
tune_hyperparameters = True
tune_hyperparameters = False
taking_logarithms = False

for i in range(1):
# cleaning_dataset()
# create_train_test_datasets()
# create_regression_datasets(taking_logarithms=taking_logarithms)
create_regression_datasets(taking_logarithms=taking_logarithms)

paradigm = "regression"
if tune_hyperparameters:
@@ -62,7 +62,7 @@
first_time = 0
keys = list(metrics.keys())
with open(output_file, 'a') as f:
f.write('No more cheating; no taking logarithms also\n')
f.write('After changing dataset\n')
f.write(', '.join(['Model'] + keys) + '\n')
with open(output_file, 'a', newline='') as f:
writer = csv.writer(f)
79 changes: 49 additions & 30 deletions packages/dataset_manipulation/dataset_manipulation.py
@@ -4,35 +4,53 @@
import random
from .exploit_symmetries import give_all_symmetries
from .exploit_symmetries import augmentate_timings
from itertools import permutations
# from sklearn.preprocessing import normalize

nvar = 3


def augmentate_dataset(features, targets, timings, cells):
def augmentate_instance(features, timings, cells, nvar):
    """Return the features, timings and cells of every variable ordering
    obtained by permuting the variables of the given instance."""
    variables = list(range(nvar))
split_features = [features[i*len(features)//nvar:(i+1)*len(features)//nvar]
for i in range(nvar)]
dict_timings = {str(perm): timing for perm, timing
in zip(permutations(variables), timings)}
dict_cells = {str(perm): cell for perm, cell in zip(permutations(variables), cells)}
augmented_features, augmented_timings, augmented_cells = [], [], []
for perm in permutations(variables):
augmented_features.append([feature for i in perm
for feature in split_features[i]])
augmented_timings.append([dict_timings[str(double_perm)]
for double_perm in permutations(perm)])
augmented_cells.append([dict_cells[str(double_perm)]
for double_perm in permutations(perm)])
return augmented_features, augmented_timings, augmented_cells



def augmentate_dataset(all_features, all_timings, all_cells, nvar):
"""
Multiply the size of the dataset by 6.
Multiply the size of the dataset by math.factorial(nvar).

Arguments:
    all_features: list(list(numpy.float))
    all_timings: list(list(numpy.float))
    all_cells: list(list(numpy.float))
"""
symmetric_features = []
symmetric_targets = []
symmetric_timings = []
symmetric_cells = []
for features, target, timing, cell in \
zip(features, targets, timings, cells):
symmetric_features += give_all_symmetries(features, int(target))
symmetric_targets += list(range(math.factorial(nvar)))
symmetric_timings += augmentate_timings(timing, int(target))
symmetric_cells += augmentate_timings(cell, int(target))

return np.array(symmetric_features), np.array(symmetric_targets), \
np.array(symmetric_timings), np.array(symmetric_cells)


def balance_dataset(features, targets, timings, cells):
augmented_features = []
augmented_timings = []
augmented_cells = []
for features, timings, cells in \
zip(all_features, all_timings, all_cells):
new_features, new_timings, new_cells = \
augmentate_instance(features, timings, cells, nvar)
augmented_features += new_features
augmented_timings += new_timings
augmented_cells += new_cells
return augmented_features, augmented_timings, augmented_cells


def balance_dataset(all_features, all_timings, all_cells, nvar):
"""
Balance the dataset so all targets are almost equally common.

@@ -41,21 +59,22 @@ def balance_dataset(features, targets, timings, cells):
targets: list(numpy.float)
"""
balanced_features = []
balanced_targets = []
balanced_timings = []
balanced_cells = []
for features, target, timing, cell in \
zip(features, targets, timings, cells):
symmetric_features = give_all_symmetries(features, int(target))
symmetric_timings = augmentate_timings(timing, int(target))
symmetric_cells = augmentate_timings(cell, int(target))
for features, timings, cells in \
zip(all_features, all_timings, all_cells):
new_target = random.choice(list(range(math.factorial(nvar))))
balanced_features.append(symmetric_features[new_target])
balanced_targets.append(new_target)
balanced_timings.append(symmetric_timings[new_target])
balanced_cells.append(symmetric_cells[new_target])
return np.array(balanced_features), np.array(balanced_targets),\
np.array(balanced_timings), np.array(balanced_cells)
new_features, new_timings, new_cells = \
augmentate_instance(features, timings, cells, nvar)
balanced_features.append(new_features[new_target])
balanced_timings.append(new_timings[new_target])
balanced_cells.append(new_cells[new_target])
return balanced_features, balanced_timings, balanced_cells

# features = [1,2,3,4,5,6]
# timings = [10,20,30,40,50,60]
# cells = [21,32,43,54,65,76]
# print(balance_dataset([features], [timings], [cells], 3))


def name_unique_features(names, features):
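A small sanity check of augmentate_instance, in the spirit of the commented-out test above; it assumes nvar = 3, so the six features split into the per-variable blocks [1, 2], [3, 4] and [5, 6]:

features = [1, 2, 3, 4, 5, 6]
timings = [10, 20, 30, 40, 50, 60]  # one timing per variable ordering
cells = [21, 32, 43, 54, 65, 76]
aug_features, aug_timings, aug_cells = \
    augmentate_instance(features, timings, cells, 3)
# the third permutation of (0, 1, 2) is (1, 0, 2),
# which moves variable 1's feature block to the front
print(aug_features[2])  # -> [3, 4, 1, 2, 5, 6]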
30 changes: 23 additions & 7 deletions test_models.py
@@ -1,18 +1,20 @@
import csv
import math
import pickle
import importlib.util
import numpy as np
from sklearn import metrics
from config.general_values import dataset_qualities
from config.ml_models import ml_models
from config.ml_models import ml_regressors
from find_filename import find_output_filename
from find_filename import find_dataset_filename
from find_filename import find_model_filename
# Check if 'dataset_manipulation' is installed
if importlib.util.find_spec('dataset_manipulation') is None:
from exploit_symmetries import give_all_symmetries
from dataset_manipulation import augmentate_instance
else:
from packages.dataset_manipulation.exploit_symmetries import give_all_symmetries
from packages.dataset_manipulation.dataset_manipulation import augmentate_instance


# def test_model(trained_model_filename, test_dataset_filename):
@@ -104,8 +106,15 @@ def test_model(ml_model, paradigm, testing_method='augmented'):
model = pickle.load(trained_model_file)
with open(test_dataset_filename, 'rb') as test_dataset_file:
testing_dataset = pickle.load(test_dataset_file)
chosen_indices = [return_regressor_choice(model, features)
for features in testing_dataset['features']]
print("here")
if ml_model in ml_regressors:
chosen_indices = [return_regressor_choice(model, features)
for features in testing_dataset['features']]
else:
chosen_indices = [model.predict([features])[0]
for features in testing_dataset['features']]
print(chosen_indices)
print("here2")
return compute_metrics(chosen_indices,
testing_dataset['labels'],
testing_dataset['timings'],
@@ -123,6 +132,7 @@ def compute_metrics(chosen_indices, labels, all_timings, all_cells):
zip(chosen_indices, labels, all_timings, all_cells):
if chosen_index == label:
correct += 1
print(timings, chosen_index)
if timings[chosen_index] not in [30, 60]:
metrics['Completed'] += 1
metrics['Total time'] += timings[chosen_index]
@@ -135,11 +145,17 @@ def compute_metrics(chosen_indices, labels, all_timings, all_cells):


def return_regressor_choice(model, features):
features_all_symmetries = give_all_symmetries(features)
nvar = 3 ## Make this better
made_up_timings = list(range(math.factorial(nvar)))
made_up_cells = list(range(math.factorial(nvar)))
augmentated_features, _, _ = \
augmentate_instance(features, made_up_timings, made_up_cells, nvar)
y_op = float('inf')
for index, x_features in enumerate(features_all_symmetries):
# print(x_features)
for index, x_features in enumerate(augmentated_features):
y_pred = model.predict([x_features])
########
# THIS IS NOT A LIST??
########
# print(y_pred)
if y_op > y_pred:
y_op = y_pred
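The diff is cut off before return_regressor_choice returns; a sketch of the remaining logic, assuming the intention is to pick the symmetry with the smallest predicted timing. On the "THIS IS NOT A LIST??" note: model.predict([x]) returns a length-1 array, so indexing with [0] extracts the scalar:

    y_op = float('inf')
    best_index = 0
    for index, x_features in enumerate(augmentated_features):
        # predict returns a length-1 array; [0] extracts the scalar
        y_pred = model.predict([x_features])[0]
        if y_pred < y_op:
            y_op = y_pred
            best_index = index
    return best_index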
10 changes: 7 additions & 3 deletions test_train_datasets.py
@@ -57,20 +57,24 @@ def create_train_test_datasets():
dataset['cells'],
test_size=0.20,
random_state=random_state)
keys = ['features', 'labels', 'timings', 'cells']
keys = ['features', 'timings', 'cells']
for purpose in purposes:
datasets[f'{purpose}_Balanced'] = \
{key: elem for key,
elem in zip(keys, balance_dataset(
*[datasets[f'{purpose}_Normal'][key2]
for key2 in keys]))
for key2 in keys], nvar=3)) ##CHOOSE NVAR WELL
}
datasets[f'{purpose}_Balanced']['labels'] = \
[timings.index(min(timings)) for timings in datasets[f'{purpose}_Balanced']['timings']]
datasets[f'{purpose}_Augmented'] = \
{key: elem for key,
elem in zip(keys, augmentate_dataset(
*[datasets[f'{purpose}_Normal'][key2]
for key2 in keys]))
for key2 in keys], nvar=3))
}
datasets[f'{purpose}_Augmented']['labels'] = \
[timings.index(min(timings)) for timings in datasets[f'{purpose}_Augmented']['timings']]
for purpose in purposes:
for quality in dataset_qualities:
this_dataset_filename = \
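The labels are now recomputed from the permuted timings rather than augmented alongside them; a one-line illustration of the rule used above:

timings = [30.0, 12.5, 60.0, 18.2, 25.0, 14.1]
label = timings.index(min(timings))  # -> 1, the fastest ordering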
6 changes: 0 additions & 6 deletions train_models.py
@@ -48,11 +48,6 @@ def train_regression_model(ml_model, method):
# trained_model_filename = find_model_filename(method, ml_model, 'regression')
# with open(trained_model_filename, 'wb') as trained_model_file:
# pickle.dump(reg, trained_model_file)
print("Real")
print(train_dataset['timings'][10:20])
print("Predicted")
print(reg.predict(train_dataset['features'])[10:20])
print(metrics.mean_squared_error(reg.predict(train_dataset['features']), train_dataset['timings']))
return reg


@@ -130,7 +125,6 @@ def get_vars_features(polynomials):
unique_features_filename = find_other_filename("unique_features")
    with open(unique_features_filename, 'rb') as unique_features_file:
unique_names = pickle.load(unique_features_file)
print(unique_names)
for var in range(nvar):
var_features, var_names = \
compute_features_for_var(polynomials, var)
