Commit
Showing 21 changed files with 539 additions and 41 deletions.
Binary file added: DatasetsBeforeProcessing/dataset_without_repetition_return_ncells.txt (BIN, +36.3 MB; contents not shown)
@@ -0,0 +1,106 @@
"""
The experiments in [1] are replicated with some changes.

The first change is that the testing data is balanced, so that all targets
are almost equally common.
Then we use three training sets: the dataset as in [1], a balanced dataset
and a data-augmentation dataset.

[1] Florescu, D., England, M. (2020). A Machine Learning Based Software
Pipeline to Pick the Variable Ordering for Algorithms with Polynomial Inputs.
In: Bigatti, A., Carette, J., Davenport, J., Joswig, M., de Wolff, T. (eds)
Mathematical Software, ICMS 2020. Lecture Notes in Computer Science,
vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30
"""


import os
import pickle
import csv
import yaml
from config.ml_models import ml_models
from config.ml_models import classifiers
from config.ml_models import dataset_types
from config.hyperparameters_grid import grid
from sklearn.model_selection import GridSearchCV


def write_yaml_to_file(py_obj, filename):
    """Dump a Python object to '<filename>.yaml'."""
    with open(f'{filename}.yaml', 'w') as f:
        yaml.dump(py_obj, f, sort_keys=False)
    print('Written to file successfully')


def k_folds_ml(x_train, y_train, model, random_state=0):
    """
    Choose hyperparameters for the desired model.

    The hyperparameters of the model are chosen using 5-fold
    cross-validation over the grid in config.hyperparameters_grid.
    """
    current_classifier = classifiers[model]
    current_grid = grid[model]
    rf_cv = GridSearchCV(estimator=current_classifier(),
                         param_grid=current_grid,
                         cv=5)
    rf_cv.fit(x_train, y_train)
    return rf_cv.best_params_


# Load the two fixed test sets: one balanced, one with the original
# ("normal") target distribution.
test_balanced_dataset_file = os.path.join(os.path.dirname(__file__),
                                          'datasets', 'test',
                                          'balanced_test_dataset.txt')
with open(test_balanced_dataset_file, 'rb') as g:
    balanced_x_test, balanced_y_test = pickle.load(g)

test_normal_dataset_file = os.path.join(os.path.dirname(__file__),
                                        'datasets', 'test',
                                        'normal_test_dataset.txt')
with open(test_normal_dataset_file, 'rb') as g:
    normal_x_test, normal_y_test = pickle.load(g)

output_file_balanced = os.path.join(os.path.dirname(__file__),
                                    'ml_results_k_fold_tested_in_balanced.csv')
with open(output_file_balanced, 'w') as f_balanced:
    writer_balanced = csv.writer(f_balanced)
    writer_balanced.writerow(["Name"] + dataset_types)
    output_file_normal = os.path.join(os.path.dirname(__file__),
                                      'ml_results_k_fold_tested_in_normal.csv')
    with open(output_file_normal, 'w') as f_normal:
        writer_normal = csv.writer(f_normal)
        writer_normal.writerow(["Name"] + dataset_types)
        for ml_model in ml_models:
            print(f"Model: {ml_model}")
            acc_balanced = dict()
            acc_normal = dict()
            for method in dataset_types:
                this_dataset_file = os.path.join(os.path.dirname(__file__),
                                                 'datasets', 'train',
                                                 f'{method}_train_dataset.txt')
                with open(this_dataset_file, 'rb') as f:
                    method_x_train, method_y_train = pickle.load(f)
                hyperparams = k_folds_ml(method_x_train, method_y_train,
                                         model=ml_model)
                write_yaml_to_file(hyperparams,
                                   os.path.join(os.path.dirname(__file__),
                                                'config', 'hyperparams',
                                                f'{method}_{ml_model}'))
                current_classifier = classifiers[ml_model]
                clf = current_classifier(**hyperparams)
                clf.fit(method_x_train, method_y_train)
                acc_balanced[method] = clf.score(balanced_x_test,
                                                 balanced_y_test)
                acc_normal[method] = clf.score(normal_x_test, normal_y_test)
                method_file = os.path.join(os.path.dirname(__file__),
                                           'config', 'models',
                                           f'{method}_trained_model.txt')
                with open(method_file, 'wb') as f_method:
                    pickle.dump(clf, f_method)
            round_accuracies_balanced = [round(acc_balanced[method_here], 2)
                                         for method_here in dataset_types]
            round_accuracies_normal = [round(acc_normal[method_here], 2)
                                       for method_here in dataset_types]
            writer_balanced.writerow([ml_model] + round_accuracies_balanced)
            writer_normal.writerow([ml_model] + round_accuracies_normal)
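
This script assumes that config/ml_models.py exposes ml_models, classifiers and dataset_types, and that config/hyperparameters_grid.py exposes grid; neither file's contents appear in this diff. A minimal hypothetical sketch consistent with how those names are used:

# Hypothetical sketch only; the real config modules are not shown in this
# commit. Model names, classifier choices and grids are illustrative.
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# config/ml_models.py
ml_models = ['RF', 'KNN']                      # names the script iterates over
classifiers = {'RF': RandomForestClassifier,   # model name -> sklearn class
               'KNN': KNeighborsClassifier}
dataset_types = ['normal', 'balanced', 'augmented']

# config/hyperparameters_grid.py: one parameter grid per model name
grid = {'RF': {'n_estimators': [50, 100], 'max_depth': [None, 10]},
        'KNN': {'n_neighbors': [3, 5, 7]}}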
@@ -0,0 +1,26 @@
import os
import pickle

import numpy as np

from replicating_Dorians_features import extract_features


dataset_file = os.path.join(os.path.dirname(__file__),
                            'DatasetsBeforeProcessing',
                            'dataset_without_repetition_return_ncells.txt')
with open(dataset_file, 'rb') as f:
    dataset = pickle.load(f)
(original_polys_list, names, features_list,
 targets_list, timings_list) = extract_features(dataset)

# working with raw features
features = np.array(features_list)
targets = np.array(targets_list)
timings = np.array(timings_list)
original_polys = np.array(original_polys_list)

clean_dataset_file = os.path.join(os.path.dirname(__file__),
                                  'datasets',
                                  'clean_dataset.txt')
with open(clean_dataset_file, 'wb') as g:
    pickle.dump((original_polys, names, features, targets, timings), g)
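
For context, extract_features (shown later in this diff) indexes the loaded pickle as dataset[0] (the projections for each instance, where entry [0][0] is the original polynomial set), dataset[1] (the target orderings) and dataset[2] (the timings). A hypothetical toy pickle of that shape, purely for illustration:

# Hypothetical minimal instance of the raw dataset layout, inferred from
# how extract_features indexes it; the real file is a 36 MB binary.
polys = [[[2, 1, 0], [0, 0, 1]]]   # one polynomial as exponent vectors
toy_dataset = ([[[polys]]],        # dataset[0]: projections per instance
               [3],                # dataset[1]: target ordering per instance
               [0.12])             # dataset[2]: timing data per instance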
Several more binary files changed (contents not shown).
@@ -0,0 +1,4 @@
from .dataset_manipulation import augmentate_dataset  # noqa: F401
from .dataset_manipulation import balance_dataset  # noqa: F401
from .dataset_manipulation import name_unique_features  # noqa: F401
from .dataset_manipulation import remove_notunique_features  # noqa: F401
packages/build/lib/dataset_manipulation/dataset_manipulation.py (72 additions, 0 deletions)
@@ -0,0 +1,72 @@
"""Exploit symmetries in polynomials to augmentate or balance the dataset."""
import math
import random

import numpy as np

from .exploit_symmetries import give_all_symmetries

nvar = 3


def augmentate_dataset(features, targets):
    """
    Multiply the size of the dataset by 6.

    Arguments:
    features: list(list(numpy.float))
    targets: list(numpy.float)
    """
    symmetric_features = []
    symmetric_targets = []
    for instance_features, target in zip(features, targets):
        symmetric_features += give_all_symmetries(instance_features,
                                                  int(target))
        symmetric_targets += list(range(math.factorial(nvar)))
    return np.array(symmetric_features), np.array(symmetric_targets)


def balance_dataset(features, targets):
    """
    Balance the dataset so that all targets are almost equally common.

    Arguments:
    features: list(list(numpy.float))
    targets: list(numpy.float)
    """
    balanced_features = []
    balanced_targets = []
    for instance_features, target in zip(features, targets):
        symmetric_features = give_all_symmetries(instance_features,
                                                 int(target))
        possible_targets = list(range(math.factorial(nvar)))
        new_target = random.choice(possible_targets)
        balanced_features.append(symmetric_features[new_target])
        balanced_targets.append(new_target)
    return np.array(balanced_features), np.array(balanced_targets)


def name_unique_features(names, features):
    """
    Return the names of unique features.

    When two features share the same value for all the instances,
    one of them is not considered unique; constant features are
    also discarded.
    """
    new_features = []
    new_names = []
    for index, feature in enumerate(zip(*features)):
        is_repeated = any(np.array_equal(feature, ex_feature)
                          for ex_feature in new_features)
        if not is_repeated and np.std(feature) != 0:
            new_features.append(feature)
            new_names.append(names[index])
    return new_names


def remove_notunique_features(unique_names, names, features):
    """Return the features whose name is in 'unique_names'."""
    unique_features = []
    for index, feature in enumerate(zip(*features)):
        if names[index] in unique_names:
            unique_features.append(feature)
    return np.transpose(unique_features)
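
A small usage sketch of the two transformations above on toy data (assuming the package is importable as dataset_manipulation; feature vectors must split into nvar = 3 equal blocks):

from dataset_manipulation import augmentate_dataset, balance_dataset

# Two toy instances with six features each (two per variable).
toy_features = [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
                [0.5, 0.1, 0.2, 0.9, 0.7, 0.3]]
toy_targets = [0.0, 3.0]

aug_x, aug_y = augmentate_dataset(toy_features, toy_targets)
print(aug_x.shape)   # (12, 6): each instance yields 6 symmetric copies
print(aug_y[:6])     # [0 1 2 3 4 5]

bal_x, bal_y = balance_dataset(toy_features, toy_targets)
print(bal_x.shape)   # (2, 6): one randomly chosen symmetry per instance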
packages/build/lib/dataset_manipulation/exploit_symmetries.py (51 additions, 0 deletions)
@@ -0,0 +1,51 @@
"""
Exploit symmetries in three-variable polynomials to generate up to six
instances out of each existing one.

The task at hand consists in classifying these features.
We will take advantage of the fact that we can change the target by
reordering the features.

This file contains:
- features_to_canonical_target: a function that reorders the features so
  that the target becomes '1'; this ordering is called the canonical order.
- give_all_symmetries: a function that, given the canonical order, returns
  the reorderings for each of the possible targets '1', '2', ..., '6'.
"""
from itertools import permutations

nvar = 3
variables = list(range(nvar))
perms = [list(elem) for elem in permutations(variables)]


def features_to_canonical_target(features, optimal_ordering):
    """
    Reorder the features so that the target becomes '1'.

    This is done by reordering the features according to the optimal
    variable ordering of the set of polynomials.
    """
    variable_orderings = perms[optimal_ordering]
    nfeatures = len(features)
    # Split the flat feature vector into one block per variable.
    split_features = [features[int(var*nfeatures/nvar):
                               int((var+1)*nfeatures/nvar)]
                      for var in range(nvar)]
    ordered_features = [split_features[variable_orderings[i]]
                        for i in range(nvar)]
    return ordered_features


def give_all_symmetries(features, optimal_ordering):
    """Reorder the features for all possible targets."""
    ordered_features = features_to_canonical_target(features,
                                                    optimal_ordering)
    all_symmetries = []
    for perm in perms:
        new_order_features = [0]*nvar
        for index, var in enumerate(perm):
            new_order_features[var] = ordered_features[index]
        flat_features = [elem for block in new_order_features
                         for elem in block]
        all_symmetries.append(flat_features)
    return all_symmetries
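
A quick illustration of give_all_symmetries, a sketch assuming six features (two per variable): passing optimal_ordering 0 means the vector is already in canonical order, and the six outputs permute the per-variable blocks.

symmetries = give_all_symmetries([1, 2, 3, 4, 5, 6], 0)
print(symmetries[0])    # [1, 2, 3, 4, 5, 6] (identity permutation)
print(symmetries[1])    # [1, 2, 5, 6, 3, 4] (blocks of variables 1 and 2 swapped)
print(len(symmetries))  # 6 reorderings, one per possible target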
@@ -0,0 +1,71 @@
import itertools

import numpy as np

nvar = 3


def aveg(given_list):
    """Return the average of the list."""
    return sum(given_list)/len(given_list)


def aveg_not_zero(given_list):
    """Return the sum of the list divided by the number of nonzero elements."""
    return sum(given_list)/max(1, len([1 for elem in given_list if elem != 0]))


def identity(value):
    return value


def sign(value):
    if isinstance(value, list):
        return [sign(elem) for elem in value]
    if value > 0:
        return 1
    elif value < 0:
        return -1
    elif value == 0:
        return 0
    else:
        raise ValueError(f"cannot compute the sign of {value}")


def create_features(degrees, variable=0, sv=False):
    """Build one feature per combination of aggregation and sign functions."""
    functions = [sum, max, aveg, aveg_not_zero]
    sign_or_not = [identity, sign]
    features = []
    features_names = []
    for choice in itertools.product(functions, sign_or_not,
                                    functions, sign_or_not):
        feature_description = (choice[0].__name__
                               + "sign"*(choice[1].__name__ == "sign")
                               + "_in_polys_" + choice[2].__name__ + "_"
                               + "sign"*(choice[3].__name__ == "sign")
                               + "of_" + "sum_of_"*sv
                               + "degrees_of_var_" + str(variable)
                               + "_in_monomials")
        feature_value = choice[0](choice[1]([choice[2](choice[3](degrees_in_poly))
                                             for degrees_in_poly in degrees]))
        features.append(feature_value)
        features_names.append(feature_description)
    return features, features_names


def extract_features(dataset):
    all_features = []
    all_targets = []
    all_timings = []
    all_original_polynomials = []
    for index, all_projections in enumerate(dataset[0]):
        original_polynomials = all_projections[0][0]
        all_original_polynomials.append(original_polynomials)
        names = []
        instance_features = []
        all_targets.append(dataset[1][index])
        all_timings.append(dataset[2][index])
        for var in range(nvar):
            # Degrees of 'var' in each monomial of each polynomial.
            degrees = [[monomial[var] for monomial in poly]
                       for poly in original_polynomials]
            var_features, var_features_names = create_features(degrees,
                                                               variable=var)
            instance_features += var_features
            names += var_features_names
            # Total degrees of the monomials that contain 'var'.
            sdegrees = [[sum(monomial) for monomial in poly
                         if monomial[var] != 0] + [0]
                        for poly in original_polynomials]
            svar_features, svar_features_names = create_features(sdegrees,
                                                                 variable=var,
                                                                 sv=True)
            instance_features += svar_features
            names += svar_features_names
        all_features.append(instance_features)
    return (np.array(all_original_polynomials), np.array(names),
            np.array(all_features), np.array(all_targets),
            np.array(all_timings))
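
To make the feature construction concrete, here is a toy run of create_features, a sketch in which polynomials are represented, as in extract_features, by lists of monomial exponent vectors:

# Two toy polynomials over (x0, x1, x2) as exponent vectors:
# x0^2*x1 + x2  and  x0*x1*x2.
polys = [[[2, 1, 0], [0, 0, 1]],
         [[1, 1, 1]]]
degrees_var0 = [[monomial[0] for monomial in poly] for poly in polys]
# degrees_var0 == [[2, 0], [1]]
features, names = create_features(degrees_var0, variable=0)
print(len(features))  # 64 = 4 functions x 2 sign options, nested twice
print(names[0])       # sum_in_polys_sum_of_degrees_of_var_0_in_monomials
print(features[0])    # 3 = (2 + 0) + (1)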
@@ -0,0 +1,100 @@
"""
The experiments in [1] are replicated with some changes.

The first change is that the testing data is balanced, so that all targets
are almost equally common.
Then we use three training sets: the dataset as in [1], a balanced dataset
and a data-augmentation dataset.

[1] Florescu, D., England, M. (2020). A Machine Learning Based Software
Pipeline to Pick the Variable Ordering for Algorithms with Polynomial Inputs.
In: Bigatti, A., Carette, J., Davenport, J., Joswig, M., de Wolff, T. (eds)
Mathematical Software, ICMS 2020. Lecture Notes in Computer Science,
vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30
"""


import os
import pickle
import csv
import importlib.util
# Check if 'dataset_manipulation' is installed.
# NB: augmentate_dataset and balance_dataset are used below with timings as
# a third argument; the copy under packages/build/lib shown earlier in this
# diff takes only (features, targets).
if importlib.util.find_spec('dataset_manipulation') is None:
    from dataset_manipulation import name_unique_features
    from dataset_manipulation import remove_notunique_features
    from dataset_manipulation import balance_dataset
    from dataset_manipulation import augmentate_dataset
else:
    from packages.dataset_manipulation import name_unique_features
    from packages.dataset_manipulation import remove_notunique_features
    from packages.dataset_manipulation import balance_dataset
    from packages.dataset_manipulation import augmentate_dataset
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


def count_instances(my_dataset, instance):
    """Count how many times 'instance' appears in 'my_dataset'."""
    return sum(my_dataset == instance)


names_features_targets_file = os.path.join(os.path.dirname(__file__),
                                           'datasets',
                                           'clean_dataset.txt')
with open(names_features_targets_file, 'rb') as f:
    original_polys, names, features, targets, timings = pickle.load(f)

(augmented_features, augmented_targets,
 augmented_timings) = augmentate_dataset(features, targets, timings)

normalized_augmented_features = normalize(augmented_features)
unique_names = name_unique_features(names, augmented_features)

random_state = 0

x = dict()  # to keep the features
y = dict()  # to keep the labels
t = dict()  # to keep the timings
# train and test sets are created
(not_unique_x_normal_train, not_unique_x_normal_test,
 y['train_normal'], y['test_normal'],
 t['train_normal'], t['test_normal']) = train_test_split(
    features, targets, timings,
    test_size=0.20,
    random_state=random_state)

(not_unique_balanced_x_test,
 y['test_balanced'], t['test_balanced']) = balance_dataset(
    not_unique_x_normal_test, y['test_normal'], t['test_normal'])
x['test_balanced'] = remove_notunique_features(unique_names, names,
                                               not_unique_balanced_x_test)
# testing data for all approaches is ready;
# all tests will be done on the balanced set, but the others are
# also computed
(not_unique_augmented_x_test,
 y['test_augmented'], t['test_augmented']) = augmentate_dataset(
    not_unique_x_normal_test, y['test_normal'], t['test_normal'])
x['test_augmented'] = remove_notunique_features(unique_names, names,
                                                not_unique_augmented_x_test)
x['test_normal'] = remove_notunique_features(unique_names, names,
                                             not_unique_x_normal_test)

x['train_normal'] = remove_notunique_features(unique_names, names,
                                              not_unique_x_normal_train)
# normal training data ready
(not_unique_balanced_x_train,
 y['train_balanced'], t['train_balanced']) = balance_dataset(
    not_unique_x_normal_train, y['train_normal'], t['train_normal'])
x['train_balanced'] = remove_notunique_features(unique_names, names,
                                                not_unique_balanced_x_train)
# balanced training data ready
(not_unique_augmented_x_train,
 y['train_augmented'], t['train_augmented']) = augmentate_dataset(
    not_unique_x_normal_train, y['train_normal'], t['train_normal'])
x['train_augmented'] = remove_notunique_features(unique_names, names,
                                                 not_unique_augmented_x_train)
# augmented training data ready


dataset_info_file = os.path.join(os.path.dirname(__file__),
                                 'datasets',
                                 'dataset_instances.csv')
with open(dataset_info_file, 'w') as f_dataset_info:
    writer = csv.writer(f_dataset_info)
    writer.writerow(['dataset', 'zero', 'one', 'two', 'three',
                     'four', 'five', 'total'])
    for usage in ['train', 'test']:
        for method in ['normal', 'balanced', 'augmented']:
            print(f"y['{usage}_{method}']", len(y[f'{usage}_{method}']))
            this_dataset_file = os.path.join(os.path.dirname(__file__),
                                             'datasets', usage,
                                             f'{method}_{usage}_dataset.txt')
            with open(this_dataset_file, 'wb') as f:
                pickle.dump((x[f'{usage}_{method}'],
                             y[f'{usage}_{method}']), f)

            writer.writerow([f'{usage} {method} dataset']
                            + [str(count_instances(y[f'{usage}_{method}'], i))
                               for i in range(6)]
                            + [str(len(y[f'{usage}_{method}']))])
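
count_instances leans on numpy broadcasting: comparing an array with a scalar yields a boolean mask, and summing it counts the True entries. A one-line check on toy labels:

import numpy as np

# target 1 appears twice in this toy label array
assert count_instances(np.array([0, 1, 1, 5]), 1) == 2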
@@ -0,0 +1,15 @@
import os

import yaml
from yaml import UnsafeLoader

from config.ml_models import ml_models
from config.ml_models import dataset_types

print(ml_models)
for ml_model in ml_models:
    for method in dataset_types:
        filename = os.path.join(os.path.dirname(__file__),
                                'config', 'hyperparams',
                                f'{method}_{ml_model}.yaml')
        with open(filename, 'r') as f:
            hyperparameters = yaml.load(f, Loader=UnsafeLoader)
        print(type(hyperparameters), hyperparameters)
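
The YAML files read here are the ones written by write_yaml_to_file from GridSearchCV.best_params_. A hypothetical round-trip with illustrative values (the model name 'RF' and the parameters are assumptions, not taken from this commit):

import yaml

# best_params_ as write_yaml_to_file would serialize it
params = {'max_depth': 10, 'n_estimators': 100}  # illustrative values
print(yaml.dump(params, sort_keys=False))
# max_depth: 10
# n_estimators: 100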