
Commit

Update
Tereso del Rio committed Jun 11, 2023
1 parent 35cd856 commit e181625
Showing 19 changed files with 250 additions and 145 deletions.
135 changes: 62 additions & 73 deletions choose_hyperparams.py
@@ -1,19 +1,3 @@
"""
The experiments in [1] are replicated with some changes.

The first change is that the testing data is balanced, so that all targets
are almost equally common.
Then we use three training sets: the dataset as in [1], a balanced dataset
and a data-augmented dataset.

[1]Florescu, D., England, M. (2020). A Machine Learning Based Software Pipeline
to Pick the Variable Ordering for Algorithms with Polynomial Inputs.
Bigatti, A., Carette, J., Davenport, J., Joswig, M., de Wolff, T. (eds)
Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science,
vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30
"""


import os
import pickle
import csv
@@ -47,66 +31,71 @@ def choose_hyperparams(ml_model, method):
chosen by cross validation is created"""
this_dataset_file = find_dataset_filename('train', method=method)
with open(this_dataset_file, 'rb') as f:
method_x_train, method_y_train = pickle.load(f)
hyperparams = k_folds_ml(method_x_train, method_y_train, model=ml_model)
x_train, y_train, _ = pickle.load(f)
hyperparams = k_folds_ml(x_train, y_train, model=ml_model)
hyperparams_filename = find_hyperparams_filename(method, ml_model)
write_yaml_to_file(hyperparams, hyperparams_filename)


test_balanced_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', 'test',
'balanced_test_dataset.txt')
with open(test_balanced_dataset_file, 'rb') as g:
balanced_x_test, balanced_y_test = pickle.load(g)
# test_balanced_dataset_file = os.path.join(os.path.dirname(__file__),
# 'datasets', 'test',
# 'balanced_test_dataset.txt')
# with open(test_balanced_dataset_file, 'rb') as g:
# balanced_x_test, balanced_y_test = pickle.load(g)

# test_normal_dataset_file = os.path.join(os.path.dirname(__file__),
# 'datasets', 'test',
# 'normal_test_dataset.txt')
# with open(test_normal_dataset_file, 'rb') as g:
# normal_x_test, normal_y_test = pickle.load(g)

# output_file_balanced = os.path.join(os.path.dirname(__file__),
# 'ml_results_k_fold_tested_in_balanced.csv')
# with open(output_file_balanced, 'w') as f_balanced:
# writer_balanced = csv.writer(f_balanced)
# writer_balanced.writerow(["Name"] + dataset_types)
# output_file_normal = os.path.join(os.path.dirname(__file__),
# 'ml_results_k_fold_tested_in_normal.csv')
# with open(output_file_normal, 'w') as f_normal:
# writer_normal = csv.writer(f_normal)
# writer_normal.writerow(["Name"] + dataset_types)
# for ml_model in ml_models:
# print(f"Model: {ml_model}")
# acc_balanced = dict()
# acc_normal = dict()
# for method in dataset_types:
# this_dataset_file = os.path.join(os.path.dirname(__file__),
# 'datasets', 'train',
# f'{method}_train_dataset.txt')
# with open(this_dataset_file, 'rb') as f:
# x_train, y_train, _ = pickle.load(f)
# hyperparams = k_folds_ml(x_train, y_train,
# model=ml_model)
# write_yaml_to_file(hyperparams,
# os.path.join(os.path.dirname(__file__),
# 'config', 'hyperparams',
# f'{method}_{ml_model}'))
# current_classifier = classifiers[ml_model]
# clf = current_classifier(**hyperparams)
# clf.fit(x_train, y_train)
# acc_balanced[method] = clf.score(balanced_x_test,
# balanced_y_test)
# acc_normal[method] = clf.score(normal_x_test, normal_y_test)
# method_filename = os.path.join(os.path.dirname(__file__),
# 'config', 'models',
# f'{method}_trained_model.txt')
# with open(method_filename, 'wb') as method_file:
# pickle.dump(clf, method_file)
# round_accuracies_balanced = [round(acc, 2)
# for acc in [acc_balanced[method_here]
# for method_here in dataset_types]]
# round_accuracies_normal = [round(acc, 2)
# for acc in [acc_normal[method_here]
# for method_here in dataset_types]]
# writer_balanced.writerow([ml_model] + round_accuracies_balanced)
# writer_normal.writerow([ml_model] + round_accuracies_normal)

test_normal_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', 'test',
'normal_test_dataset.txt')
with open(test_normal_dataset_file, 'rb') as g:
normal_x_test, normal_y_test = pickle.load(g)

output_file_balanced = os.path.join(os.path.dirname(__file__),
'ml_results_k_fold_tested_in_balanced.csv')
with open(output_file_balanced, 'w') as f_balanced:
writer_balanced = csv.writer(f_balanced)
writer_balanced.writerow(["Name"] + dataset_types)
output_file_normal = os.path.join(os.path.dirname(__file__),
'ml_results_k_fold_tested_in_normal.csv')
with open(output_file_normal, 'w') as f_normal:
writer_normal = csv.writer(f_normal)
writer_normal.writerow(["Name"] + dataset_types)
for ml_model in ml_models:
print(f"Model: {ml_model}")
acc_balanced = dict()
acc_normal = dict()
for method in dataset_types:
this_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', 'train',
f'{method}_train_dataset.txt')
with open(this_dataset_file, 'rb') as f:
method_x_train, method_y_train = pickle.load(f)
hyperparams = k_folds_ml(method_x_train, method_y_train,
model=ml_model)
write_yaml_to_file(hyperparams,
os.path.join(os.path.dirname(__file__),
'config', 'hyperparams',
f'{method}_{ml_model}'))
current_classifier = classifiers[ml_model]
clf = current_classifier(**hyperparams)
clf.fit(method_x_train, method_y_train)
acc_balanced[method] = clf.score(balanced_x_test,
balanced_y_test)
acc_normal[method] = clf.score(normal_x_test, normal_y_test)
method_file = os.path.join(os.path.dirname(__file__),
'config', 'models',
f'{method}_trained_model.txt')
with open(method_file, 'wb') as f_method:
pickle.dump(clf, f_method)
round_accuracies_balanced = [round(acc, 2)
for acc in [acc_balanced[method_here]
for method_here in dataset_types]]
round_accuracies_normal = [round(acc, 2)
for acc in [acc_normal[method_here]
for method_here in dataset_types]]
writer_balanced.writerow([ml_model] + round_accuracies_balanced)
writer_normal.writerow([ml_model] + round_accuracies_normal)
# model = 'KNN'
# method = 'balanced'
# choose_hyperparams(model, method)
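The refactored choose_hyperparams above delegates the cross-validated search to k_folds_ml and the serialisation to write_yaml_to_file, neither of which appears in this commit. A minimal sketch of what they could look like, assuming scikit-learn's GridSearchCV and a hypothetical per-model grid (the names and grids below are illustrative, not the project's actual config):

import yaml
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hypothetical search space and estimator map; the real ones live in config/ml_models.
param_grids = {'RF': {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10]}}
estimators = {'RF': RandomForestClassifier}


def k_folds_ml_sketch(x_train, y_train, model='RF', folds=5):
    """Pick hyperparameters by grid search with k-fold cross-validation."""
    search = GridSearchCV(estimators[model](), param_grids[model],
                          cv=folds, scoring='accuracy')
    search.fit(x_train, y_train)
    return search.best_params_


def write_yaml_to_file_sketch(hyperparams, filename):
    """Persist the chosen hyperparameters so train_models.py can reload them."""
    with open(filename + '.yaml', 'w') as yaml_file:
        yaml.dump(hyperparams, yaml_file)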
30 changes: 29 additions & 1 deletion create_clean_dataset.py
@@ -1,3 +1,8 @@
"""This file contains a function that given the raw dataset containing
the sets of polynomials and its timings for each order, creates a dataset
containing a set of unique features and its class"""

import os
import pickle
import numpy as np
from replicating_Dorians_features import extract_features
@@ -6,6 +11,30 @@
from dataset_manipulation import remove_notunique_features
else:
from packages.dataset_manipulation import remove_notunique_features
from from_poly_set_to_features import poly_set_feature_extractor


def create_dataframe(dataset):
all_features = []
all_targets = dataset[1][:]
all_timings = dataset[2][:]
all_original_polynomials = []
for index, all_projections in enumerate(dataset[0]):
original_polynomials = all_projections[0][0]
all_original_polynomials.append(original_polynomials)
names, all_features = poly_set_feature_extractor(all_original_polynomials,
determine_standarization=True,
determine_unique_features=True)
return np.array(all_original_polynomials), np.array(names),\
np.array(all_features), np.array(all_targets), np.array(all_timings)


dataset_filename = os.path.join(os.path.dirname(__file__),
'DatasetsBeforeProcessing',
'dataset_without_repetition_return_ncells.txt')
with open(dataset_filename, 'rb') as f:
dataset = pickle.load(f)
original_polys_list, names, features_list, targets_list, timings_list = create_dataframe(dataset)


def cleaning_dataset(dataset_filename, clean_dataset_filename):
@@ -20,7 +49,6 @@ def cleaning_dataset(dataset_filename, clean_dataset_filename):
targets = np.array(targets_list)
timings = np.array(timings_list)
original_polys = np.array(original_polys_list)

with open(clean_dataset_filename, 'wb') as clean_dataset_file:
dataset = pickle.dump((original_polys, unique_names,
unique_features, targets, timings),
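create_dataframe and cleaning_dataset above index the raw pickle as dataset[0] (every projection of each instance, whose first projection starts with the original polynomials), dataset[1] (the target ordering) and dataset[2] (the timings of every ordering). The layout sketched below is an inference from that indexing, with made-up values, and is not verified against the actual file:

# Assumed (not verified) shape of the raw pickle read by cleaning_dataset.
instance_projections = [            # all_projections for one instance
    [                               # projection under the first variable ordering
        [[(2, 1, 0, 1)], [(0, 0, 3, 1)]],   # level 0 = the original polynomials
        # ... later projection levels ...
    ],
    # ... projections under the other orderings ...
]
dataset = ([instance_projections],                   # dataset[0]
           [3],                                      # dataset[1]: best ordering per instance
           [[1.2, 0.9, 4.1, 0.7, 2.3, 5.0]])         # dataset[2]: one timing per ordering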
4 changes: 2 additions & 2 deletions datasets/dataset_instances.csv
@@ -1,7 +1,7 @@
dataset,zero,one,two,three,four,five,total
train normal dataset,326,74,105,41,163,106,815
train balanced dataset,130,120,135,143,135,152,815
train balanced dataset,118,136,125,149,134,153,815
train augmented dataset,815,815,815,815,815,815,4890
test normal dataset,80,19,30,10,39,26,204
test balanced dataset,34,31,32,37,39,31,204
test balanced dataset,39,32,36,29,31,37,204
test augmented dataset,204,204,204,204,204,204,1224
Binary file modified datasets/test/augmented_test_dataset.txt
Binary file not shown.
Binary file modified datasets/test/balanced_test_dataset.txt
Binary file not shown.
Binary file modified datasets/test/normal_test_dataset.txt
Binary file not shown.
Binary file modified datasets/train/augmented_train_dataset.txt
Binary file not shown.
Binary file modified datasets/train/balanced_train_dataset.txt
Binary file not shown.
Binary file modified datasets/train/normal_train_dataset.txt
Binary file not shown.
20 changes: 12 additions & 8 deletions main.py
@@ -22,16 +22,20 @@
from test_models import test_results


original_dataset_file = find_dataset_filename('unclean')
clean_dataset_filename = find_dataset_filename('clean')
cleaning_dataset(original_dataset_file, clean_dataset_filename)
create_train_test_datasets()
# original_dataset_file = find_dataset_filename('unclean')
# clean_dataset_filename = find_dataset_filename('clean')
# cleaning_dataset(original_dataset_file, clean_dataset_filename)
# create_train_test_datasets()

# for ml_model in ml_models:
# for method in dataset_types:
# print(f"Choosing hyperparameters for {ml_model} in {method}")
# choose_hyperparams(ml_model, method)
for ml_model in ml_models:
print(f"Training {ml_model}")
for method in dataset_types:
choose_hyperparams(ml_model, method)
for ml_model in ml_models:
for method in dataset_types:
print(f"for {method}")
train_model(ml_model, method)
for testing_method in ['normal', 'balanced']:
for testing_method in dataset_types:
print(f"Testing {testing_method}")
test_results(testing_method)
3 changes: 2 additions & 1 deletion packages/build/lib/dataset_manipulation/__init__.py
@@ -1,4 +1,5 @@
from .dataset_manipulation import augmentate_dataset # noqa401
from .dataset_manipulation import balance_dataset # noqa401
from .dataset_manipulation import name_unique_features # noqa401
from .dataset_manipulation import remove_notunique_features # noqa401
from .dataset_manipulation import remove_notunique_features # noqa401
from .exploit_symmetries import give_all_symmetries # noqa401
41 changes: 34 additions & 7 deletions packages/build/lib/dataset_manipulation/dataset_manipulation.py
@@ -3,11 +3,12 @@
import math
import random
from .exploit_symmetries import give_all_symmetries
# from sklearn.preprocessing import normalize

nvar = 3


def augmentate_dataset(features, targets):
def augmentate_dataset(features, targets, timings):
"""
Multiply the size of the dataset by 6.

@@ -17,13 +18,16 @@ def augmentate_dataset(features, targets):
"""
symmetric_features = []
symmetric_targets = []
for features, target in zip(features, targets):
symmetric_timings = []
for features, target, timing in zip(features, targets, timings):
symmetric_features += give_all_symmetries(features, int(target))
symmetric_targets += list(range(math.factorial(nvar)))
return np.array(symmetric_features), np.array(symmetric_targets)
symmetric_timings += list(timing)
return np.array(symmetric_features), np.array(symmetric_targets), \
np.array(symmetric_timings)


def balance_dataset(features, targets):
def balance_dataset(features, targets, timings):
"""
Balance the dataset so all targets are almost equally common.

@@ -33,13 +37,16 @@ def balance_dataset(features, targets):
"""
balanced_features = []
balanced_targets = []
for features, target in zip(features, targets):
balanced_timings = []
for features, target, timing in zip(features, targets, timings):
symmetric_features = give_all_symmetries(features, int(target))
possible_targets = list(range(math.factorial(nvar)))
new_target = random.choice(possible_targets)
balanced_features.append(symmetric_features[new_target])
balanced_targets.append(new_target)
return np.array(balanced_features), np.array(balanced_targets)
balanced_timings.append(timing[new_target])
return np.array(balanced_features), np.array(balanced_targets),\
np.array(balanced_timings)


def name_unique_features(names, features):
@@ -57,16 +64,36 @@ def name_unique_features(names, features):
for ex_feature in new_features])
or np.std(feature) == 0):
rep += 1
elif feature.count(feature[0]) == len(feature):
print(names[index])
else:
# if 'max_in_polys_max_sig'==names[index][:20]:
# print("Check ", feature.count(feature[0])==len(feature))
# print(names[index])
# print(len(feature))
new_features.append(feature)
new_names.append(names[index])
return new_names


def remove_notunique_features(unique_names, names, features):
def get_unique_feature_names(unique_names, names, features):
"""Return the features corresponding to a name in 'unique_names'."""
unique_features = []
for index, feature in enumerate(zip(*features)):
if names[index] in unique_names:
unique_features.append(feature)
return np.transpose(unique_features)


def remove_notunique_features(names, features):
# creating some targets and timings because the function requires them
targets = [0]*len(features)
timings = [[0, 0]]*len(features)
augmented_features, _, _ = augmentate_dataset(features, targets, timings)
# normalized_augmented_features = normalize(augmented_features)
unique_names = name_unique_features(names, augmented_features)
unique_features = []
for index, feature in enumerate(zip(*features)):
if names[index] in unique_names:
unique_features.append(feature)
return unique_names, np.transpose(unique_features)
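The augmentation and balancing routines above both lean on give_all_symmetries: with nvar = 3 there are 3! = 6 variable orderings, so augmentation turns every instance into six relabelled copies while balancing keeps one randomly chosen ordering per instance. A toy run of the new three-return-value signatures, with a schematic feature vector and made-up timings:

import math
from packages.dataset_manipulation import augmentate_dataset, balance_dataset

features = [[1.0, 2.0, 3.0]]                  # one instance (schematic feature vector)
targets = [2]                                 # ordering 2 was the fastest
timings = [[5.0, 4.0, 1.0, 6.0, 7.0, 8.0]]    # one timing per ordering

aug_x, aug_y, aug_t = augmentate_dataset(features, targets, timings)
assert len(aug_x) == len(features) * math.factorial(3)   # six copies, targets 0..5

bal_x, bal_y, bal_t = balance_dataset(features, targets, timings)
assert len(bal_x) == len(features)            # one randomly chosen ordering kept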
packages/build/lib/dataset_manipulation/exploit_symmetries.py
@@ -37,7 +37,9 @@ def features_to_canonical_target(features, optimal_ordering):


def give_all_symmetries(features, optimal_ordering):
"""Reorder the features for all possible targets."""
"""Reorder the features for all possible targets.
Returns a list of all symmetries, the first one
corresponding to the optimal ordering."""
ordered_features = features_to_canonical_target(features,
optimal_ordering)
all_symmetries = []
3 changes: 2 additions & 1 deletion packages/dataset_manipulation/__init__.py
@@ -1,4 +1,5 @@
from .dataset_manipulation import augmentate_dataset # noqa401
from .dataset_manipulation import balance_dataset # noqa401
from .dataset_manipulation import name_unique_features # noqa401
from .dataset_manipulation import remove_notunique_features # noqa401
from .dataset_manipulation import remove_notunique_features # noqa401
from .exploit_symmetries import give_all_symmetries # noqa401
15 changes: 10 additions & 5 deletions packages/dataset_manipulation/dataset_manipulation.py
@@ -3,7 +3,7 @@
import math
import random
from .exploit_symmetries import give_all_symmetries
from sklearn.preprocessing import normalize
# from sklearn.preprocessing import normalize

nvar = 3

@@ -23,7 +23,8 @@ def augmentate_dataset(features, targets, timings):
symmetric_features += give_all_symmetries(features, int(target))
symmetric_targets += list(range(math.factorial(nvar)))
symmetric_timings += list(timing)
return np.array(symmetric_features), np.array(symmetric_targets), np.array(symmetric_timings)
return np.array(symmetric_features), np.array(symmetric_targets), \
np.array(symmetric_timings)


def balance_dataset(features, targets, timings):
@@ -44,7 +45,8 @@ def balance_dataset(features, targets, timings):
balanced_features.append(symmetric_features[new_target])
balanced_targets.append(new_target)
balanced_timings.append(timing[new_target])
return np.array(balanced_features), np.array(balanced_targets), np.array(balanced_timings)
return np.array(balanced_features), np.array(balanced_targets),\
np.array(balanced_timings)


def name_unique_features(names, features):
@@ -58,11 +60,14 @@ def name_unique_features(names, features):
new_names = []
rep = 0
for index, feature in enumerate(zip(*features)):
# print(feature)
# if any([type(xfeature) == str for xfeature in feature]):
# print(feature)
if (any([np.array_equal(feature, ex_feature)
for ex_feature in new_features])
or np.std(feature) == 0):
rep += 1
elif feature.count(feature[0])==len(feature):
elif feature.count(feature[0]) == len(feature):
print(names[index])
else:
# if 'max_in_polys_max_sig'==names[index][:20]:
@@ -86,7 +91,7 @@ def get_unique_feature_names(unique_names, names, features):
def remove_notunique_features(names, features):
# creating some targets and timings because the function requires them
targets = [0]*len(features)
timings = [[0,0]]*len(features)
timings = [[0, 0]]*len(features)
augmented_features, _, _ = augmentate_dataset(features, targets, timings)
# normalized_augmented_features = normalize(augmented_features)
unique_names = name_unique_features(names, augmented_features)
4 changes: 3 additions & 1 deletion packages/dataset_manipulation/exploit_symmetries.py
@@ -37,7 +37,9 @@ def features_to_canonical_target(features, optimal_ordering):


def give_all_symmetries(features, optimal_ordering):
"""Reorder the features for all possible targets."""
"""Reorder the features for all possible targets.
Returns a list of all symmetries, the first one
corresponding to the optimal ordering."""
ordered_features = features_to_canonical_target(features,
optimal_ordering)
all_symmetries = []
49 changes: 28 additions & 21 deletions replicating_Dorians_features.py
@@ -3,29 +3,28 @@
from xml.sax.handler import all_features
import numpy as np

nvar=3




def aveg(given_list):
return sum(given_list)/len(given_list)


def aveg_not_zero(given_list):
return sum(given_list)/max(1,len([1 for elem in given_list if elem!=0]))


def identity(input):
return input


def sign(input):
if type(input)==list:
if type(input) == list:
return [sign(elem) for elem in input]
else:
if input>0:
if input > 0:
return 1
elif input<0:
elif input < 0:
return -1
elif input==0:
elif input == 0:
return 0
else:
raise Exception("How is this possible?")
@@ -51,21 +50,29 @@ def extract_features(dataset):
all_original_polynomials = []
for index, all_projections in enumerate(dataset[0]):
original_polynomials = all_projections[0][0]
# the original polynomials are the initial polynomials of any
# of the possible projections (also of the first one)
all_original_polynomials.append(original_polynomials)
names = []
instance_features = []
all_targets.append(dataset[1][index])
all_timings.append(dataset[2][index])
for var in range(nvar):
degrees = [[monomial[var] for monomial in poly]
for poly in original_polynomials]
var_features, var_features_names = create_features(degrees,
variable=var)
instance_features += var_features
names += var_features_names
sdegrees = [[sum(monomial) for monomial in poly if monomial[var]!=0]+[0] for poly in original_polynomials]
svar_features, svar_features_names = create_features(sdegrees, variable=var, sv=True)
instance_features += svar_features
names += svar_features_names
names, instance_features = features_from_set_of_polys(original_polynomials)
all_features.append(instance_features)
return np.array(all_original_polynomials), np.array(names), np.array(all_features), np.array(all_targets), np.array(all_timings)


def features_from_set_of_polys(original_polynomials):
instance_features = []
names = []
nvar = len(original_polynomials[0][0]) - 1
for var in range(nvar):
degrees = [[monomial[var] for monomial in poly]
for poly in original_polynomials]
var_features, var_features_names = create_features(degrees,
variable=var)
instance_features += var_features
names += var_features_names
sdegrees = [[sum(monomial) for monomial in poly if monomial[var]!=0]+[0] for poly in original_polynomials]
svar_features, svar_features_names = create_features(sdegrees, variable=var, sv=True)
instance_features += svar_features
names += svar_features_names
return names, instance_features
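The refactoring above pulls the per-variable feature construction into features_from_set_of_polys: for every variable it collects the exponent of that variable in each monomial (degrees) and the sums of the exponent tuples of the monomials that contain the variable, padded with a 0 (sdegrees), before create_features reduces each list to summary statistics. Hand-working one tiny polynomial set (the last tuple entry is assumed here to be a coefficient slot, inferred from nvar = len(original_polynomials[0][0]) - 1):

# x0^2*x1 + x2^3, encoded as exponent tuples (x0, x1, x2, assumed coefficient slot)
original_polynomials = [[(2, 1, 0, 1), (0, 0, 3, 1)]]

var = 0
degrees = [[monomial[var] for monomial in poly]
           for poly in original_polynomials]
# -> [[2, 0]]: exponent of x0 in each monomial of each polynomial

sdegrees = [[sum(monomial) for monomial in poly if monomial[var] != 0] + [0]
            for poly in original_polynomials]
# -> [[4, 0]]: tuple sums of the monomials containing x0, padded with 0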
30 changes: 7 additions & 23 deletions test_train_datasets.py
@@ -1,20 +1,3 @@
"""
The experiments in [1] are replicated with some changes.

The first change is that the testing data is balanced, so that all targets
are almost equally common.
Then we use three training sets: the dataset as in [1], a balanced dataset
and a data-augmented dataset.

[1]Florescu, D., England, M. (2020). A Machine Learning Based Software Pipeline
to Pick the Variable Ordering for Algorithms with Polynomial Inputs.
Bigatti, A., Carette, J., Davenport, J., Joswig, M., de Wolff, T. (eds)
Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science,
vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30
"""


import os
import pickle
import csv
import importlib.util
@@ -29,6 +12,7 @@
from packages.dataset_manipulation import augmentate_dataset
from sklearn.model_selection import train_test_split
from find_filename import find_dataset_filename
from find_filename import find_other_filename


def count_instances(my_dataset, instance):
@@ -40,7 +24,9 @@ def create_train_test_datasets():
with open(clean_dataset_filename, 'rb') as clean_dataset_file:
_, names, features, targets, timings = pickle.load(clean_dataset_file)
unique_names, unique_features = remove_notunique_features(names, features)

unique_features_filename = find_other_filename("unique_features")
with open(unique_features_filename, 'wb') as unique_features_file:
pickle.dump(unique_features, unique_features_file)
x = dict() # to keep the features
y = dict() # to keep the labels
t = dict() # to keep the timings
@@ -58,11 +44,9 @@ def create_train_test_datasets():
writer.writerow(['dataset'] + ['zero', 'one', 'two', 'three', 'four', 'five', 'total'])
for usage in ['train', 'test']:
for method in ['normal', 'balanced', 'augmented']:
this_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', usage,
f'{method}_{usage}_dataset.txt')
with open(this_dataset_file, 'wb') as f:
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}']), f)
this_dataset_filename = find_dataset_filename(usage, method=method)
with open(this_dataset_filename, 'wb') as this_dataset_file:
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}'], t[f'{usage}_{method}']), this_dataset_file)

writer.writerow([f'{usage} {method} dataset']
+ [str(count_instances(y[f'{usage}_{method}'], i))
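Putting the imports and the save loop together, the three dataset variants appear to come from a single train/test split of the cleaned data, with balance_dataset and augmentate_dataset applied to each half. The sketch below reuses the variables already in scope in create_train_test_datasets and is not its literal body; the 80/20 proportion is inferred from the 815/204 counts in dataset_instances.csv:

from sklearn.model_selection import train_test_split
from packages.dataset_manipulation import balance_dataset, augmentate_dataset

x, y, t = dict(), dict(), dict()
(x['train_normal'], x['test_normal'],
 y['train_normal'], y['test_normal'],
 t['train_normal'], t['test_normal']) = train_test_split(
    unique_features, targets, timings, test_size=0.2, random_state=0)

for usage in ['train', 'test']:
    x[f'{usage}_balanced'], y[f'{usage}_balanced'], t[f'{usage}_balanced'] = \
        balance_dataset(x[f'{usage}_normal'], y[f'{usage}_normal'], t[f'{usage}_normal'])
    x[f'{usage}_augmented'], y[f'{usage}_augmented'], t[f'{usage}_augmented'] = \
        augmentate_dataset(x[f'{usage}_normal'], y[f'{usage}_normal'], t[f'{usage}_normal'])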
57 changes: 56 additions & 1 deletion train_models.py
@@ -1,20 +1,75 @@
import pickle
from yaml_tools import read_yaml_from_file
from config.ml_models import classifiers
from config.ml_models import ml_regressors
from config.ml_models import regressors
from find_filename import find_dataset_filename
from find_filename import find_hyperparams_filename
from find_filename import find_model_filename
from dataset_manipulation import give_all_symmetries
import numpy as np
from sklearn import metrics


def train_model(ml_model, method):
train_data_filename = find_dataset_filename('train', method=method)
hyperparams_file = find_hyperparams_filename(method, ml_model)
with open(train_data_filename, 'rb') as train_data_file:
x_train, y_train = pickle.load(train_data_file)
x_train, y_train, _ = pickle.load(train_data_file)
hyperparams = read_yaml_from_file(hyperparams_file)
current_classifier = classifiers[ml_model]
clf = current_classifier(**hyperparams)
clf.fit(x_train, y_train)
trained_model_filename = find_model_filename(method, ml_model)
with open(trained_model_filename, 'wb') as trained_model_file:
pickle.dump(clf, trained_model_file)


def train_regression_model(ml_model, method):
train_data_filename = find_dataset_filename('train', method=method)
with open(train_data_filename, 'rb') as train_data_file:
x_train, _, t_train = pickle.load(train_data_file)
# hyperparams_file = find_hyperparams_filename(method, ml_model)
# hyperparams = read_yaml_from_file(hyperparams_file)
x_train = np.asarray([x_t for x_t, t_t in zip(x_train, t_train)
if t_t[:4] != 'Over'], dtype=float)
t_train = np.asarray([t_t for t_t in t_train
if t_t[:4] != 'Over'], dtype=float)
current_classifier = regressors[ml_model]
# print(t_train)
print("her")
reg = current_classifier() # **hyperparams)
reg.fit(x_train, t_train)
# trained_model_filename = find_model_filename(method, ml_model, 'regression')
# with open(trained_model_filename, 'wb') as trained_model_file:
# pickle.dump(reg, trained_model_file)
print("Real")
print(t_train[10:20])
print("Predicted")
print(reg.predict(x_train)[10:20])
print(metrics.mean_squared_error(reg.predict(x_train), t_train))
return reg


def choose_using_regression(x_test, regressor):
timings = regressor.predict(give_all_symmetries(x_test, 0))
return np.argmin(timings)


def test_regression_model(method, regressor):
test_data_filename = find_dataset_filename('test', method=method)
with open(test_data_filename, 'rb') as test_data_file:
x_test, y_test, t_test = pickle.load(test_data_file)
x_test = np.asarray([x_t for x_t, t_t in zip(x_test, t_test)
if t_t[:4] != 'Over'], dtype=float)
y_test = np.asarray([y_t for y_t, t_t in zip(y_test, t_test)
if t_t[:4] != 'Over'], dtype=float)
y_pred = [choose_using_regression(x_i, regressor) for x_i in x_test]
print("ACC", metrics.accuracy_score(y_test, y_pred))


# for ml_reg in ml_regressors:
# print(ml_reg)
# regressor = train_regression_model(ml_reg, 'balanced')
# print(ml_reg)
# test_regression_model('balanced', regressor)
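Two assumptions behind the regression path above are worth spelling out: timeouts seem to be stored as strings beginning with 'Over' (hence the t_t[:4] != 'Over' filters), and give_all_symmetries(x, 0) yields the six permuted feature vectors of a test instance so that the ordering with the smallest predicted timing can be chosen. A small illustration of both, with made-up values:

import numpy as np

# Dropping timed-out instances (timings assumed to be numbers or 'Over ...' strings).
t_raw = [1.7, 'Over 30', 0.4]
x_raw = [[1.0], [2.0], [3.0]]
x_kept = np.asarray([x_i for x_i, t_i in zip(x_raw, t_raw)
                     if str(t_i)[:4] != 'Over'], dtype=float)
t_kept = np.asarray([t_i for t_i in t_raw if str(t_i)[:4] != 'Over'], dtype=float)

# Choosing an ordering: predict one timing per symmetry and keep the cheapest,
# which is what choose_using_regression does via np.argmin.
predicted_timings = np.array([5.0, 4.0, 0.5, 6.0, 7.0, 8.0])
chosen_ordering = int(np.argmin(predicted_timings))   # -> 2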
