Commit 051d649

More metrics can be computed now

Tereso del Rio committed Sep 12, 2023
1 parent e9227ce commit 051d649
Showing 13 changed files with 142 additions and 73 deletions.
34 changes: 20 additions & 14 deletions create_clean_dataset.py
@@ -12,6 +12,7 @@
from packages.dataset_manipulation import remove_notunique_features
from from_poly_set_to_features import poly_set_feature_extractor
from find_filename import find_dataset_filename
from find_filename import find_other_filename


def create_dataframe(dataset):
@@ -44,21 +45,26 @@ def cleaning_dataset():
clean_dataset_filename = find_dataset_filename('clean')
with open(dataset_filename, 'rb') as f:
dataset = pickle.load(f)
original_polys_list, names, features_list, targets_list, timings_list =\
extract_features(dataset)
# working with raw features
features = np.array(features_list)
unique_names, unique_features = remove_notunique_features(names, features)

targets = np.array(targets_list)
timings = np.array([[convert_to_timing(timings_ordering)
for timings_ordering in timings_problem]
for timings_problem in timings_list])
original_polys = np.array(original_polys_list)
my_dataset = extract_features(dataset)
clean_dataset = dict()
# # working with raw features
# features = np.array(features_list)
clean_dataset['names'], clean_dataset['features'] = \
remove_notunique_features(my_dataset['names'],
my_dataset['features'])
unique_features_filename = find_other_filename("unique_features")
with open(unique_features_filename, 'wb') as unique_features_file:
pickle.dump(clean_dataset['names'], unique_features_file)
# Some timings are expressed as "Over 30", this is changed here
clean_dataset['timings'] = \
np.array([[convert_to_timing(timings_ordering)
for timings_ordering in timings_problem]
for timings_problem in my_dataset['timings']])
for key in my_dataset:
if key not in clean_dataset:
clean_dataset[key] = my_dataset[key]
with open(clean_dataset_filename, 'wb') as clean_dataset_file:
dataset = pickle.dump((original_polys, unique_names,
unique_features, targets, timings),
clean_dataset_file)
pickle.dump(clean_dataset, clean_dataset_file)

# dataset_filename = os.path.join(os.path.dirname(__file__),
# 'DatasetsBeforeProcessing',
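The comment in the new code notes that some timings are stored as the string "Over 30" and must be converted before being packed into a numpy array. convert_to_timing itself is not part of this diff, so the following is only a minimal sketch of what such a conversion might look like; the marker string and the penalty value assigned to over-limit runs are assumptions.

# Hypothetical sketch only: the real convert_to_timing is not shown in this diff.
# Assumes a raw timing is either a number or the string "Over 30", and that
# over-limit runs are replaced by a fixed penalty value.
OVER_LIMIT_PENALTY = 60.0  # assumed penalty; the repository may use another value


def convert_to_timing(raw_timing):
    """Map a raw timing entry to a float that numpy can store."""
    if isinstance(raw_timing, str) and raw_timing.startswith("Over"):
        return OVER_LIMIT_PENALTY
    return float(raw_timing)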
Binary file modified datasets/clean_dataset.txt
Binary file not shown.
4 changes: 2 additions & 2 deletions datasets/dataset_instances.csv
@@ -1,7 +1,7 @@
dataset,zero,one,two,three,four,five,total
train normal dataset,326,74,105,41,163,106,815
train balanced dataset,125,140,134,121,136,159,815
train balanced dataset,137,123,145,150,122,138,815
train augmented dataset,815,815,815,815,815,815,4890
test normal dataset,80,19,30,10,39,26,204
test balanced dataset,30,38,36,36,30,34,204
test balanced dataset,30,30,41,35,36,32,204
test augmented dataset,204,204,204,204,204,204,1224
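These counts are regenerated by test_train_datasets.py (see the count_instances helper and the writer.writerow call further down in this commit). A minimal sketch of how one such row could be produced, assuming count_instances simply tallies how often a class label appears in the target vector; the output file name and toy targets are made up.

import csv


def count_instances(targets, label):
    # Assumed behaviour of the repository's count_instances helper.
    return sum(1 for target in targets if target == label)


def write_instance_row(writer, name, targets, n_classes=6):
    # One CSV row: dataset name, one count per class, then the total.
    writer.writerow([name]
                    + [str(count_instances(targets, i)) for i in range(n_classes)]
                    + [str(len(targets))])


with open("dataset_instances_example.csv", "w", newline="") as f:  # toy output file
    writer = csv.writer(f)
    writer.writerow(["dataset", "zero", "one", "two", "three", "four", "five", "total"])
    write_instance_row(writer, "train normal dataset", [0, 1, 2, 0, 5, 3, 4])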
Binary file modified datasets/test/augmented_test_dataset.txt
Binary file not shown.
Binary file modified datasets/test/balanced_test_dataset.txt
Binary file not shown.
Binary file modified datasets/test/normal_test_dataset.txt
Binary file not shown.
Binary file modified datasets/train/augmented_train_dataset.txt
Binary file not shown.
Binary file modified datasets/train/balanced_train_dataset.txt
Binary file not shown.
Binary file modified datasets/train/normal_train_dataset.txt
Binary file not shown.
68 changes: 41 additions & 27 deletions main.py
@@ -12,6 +12,7 @@
Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science,
vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30
"""
import csv
from config.ml_models import ml_models
from config.ml_models import dataset_types
from find_filename import find_dataset_filename
@@ -29,7 +30,7 @@
# if tune_hyperparameters is used to decide whether to tune them
# or to use previously tuned ones
# tune_hyperparameters = False

paradigm = 'classification'

# cleaning_dataset()
# create_train_test_datasets()
@@ -44,31 +45,44 @@
# for method in dataset_types:
# print(f"for {method}")
# train_model(ml_model, method)
# for training_method in dataset_types:
# print(f"Testing models trained in {training_method}")
# test_results(training_method)

timings = dict()
training_method = 'augmented'
testing_method = 'augmented'
test_dataset_filename = find_dataset_filename('test',
testing_method)

with open("classification_output_timings.csv", 'w') as f:
f.write("model, Normal, Balanced, Augmented\n")
first_time = 1
output_file = "classification_output_acc_time.csv"
for ml_model in ml_models:
for training_method in dataset_types:
trained_model_filename = find_model_filename(training_method,
ml_model)
accuracy = test_model(trained_model_filename,
test_dataset_filename)
timings[training_method] = timings_in_test(ml_model, testing_method,
training_method)
total_time = sum(timings[training_method])
# with open("classification_output_acc_time.csv", 'a') as f:
# f.write(f"{ml_model}, {accuracy}, {total_time}\n")
with open("classification_output_timings.csv", 'a') as f:
f.write(f"{ml_model}, {sum(timings['Normal'])}, {sum(timings['Balanced'])}, {sum(timings['Augmented'])}\n")
timings['optimal'] = timings_in_test('optimal', testing_method)
print(sum(timings['optimal']))
from make_plots import survival_plot
survival_plot(timings, plot_name=f"survival_plot_{ml_model}")
print(f"Testing models trained in {training_method}")
metrics = test_model(ml_model, paradigm=training_method, testing_method=testing_method)
if first_time == 1:
first_time = 0
keys = list(metrics.keys())
with open(output_file, 'a') as f:
f.write(', '.join(['Model'] + keys) + '\n')
with open(output_file, 'a', newline='') as f:
writer = csv.writer(f)
writer.writerow([ml_model] + [metrics[key] for key in keys])


# timings = dict()
# testing_method = 'augmented'
# test_dataset_filename = find_dataset_filename('test',
# testing_method)

# with open("classification_output_timings.csv", 'w') as f:
# f.write("model, Normal, Balanced, Augmented\n")
# for ml_model in ml_models:
# for training_method in dataset_types:
# trained_model_filename = find_model_filename(training_method,
# ml_model)
# accuracy = test_model(trained_model_filename,
# test_dataset_filename)
# timings[training_method] = timings_in_test(ml_model, testing_method,
# training_method)
# total_time = sum(timings[training_method])
# # with open("classification_output_acc_time.csv", 'a') as f:
# # f.write(f"{ml_model}, {accuracy}, {total_time}\n")
# with open("classification_output_timings.csv", 'a') as f:
# f.write(f"{ml_model}, {sum(timings['Normal'])}, {sum(timings['Balanced'])}, {sum(timings['Augmented'])}\n")
# timings['optimal'] = timings_in_test('optimal', testing_method)
# print(sum(timings['optimal']))
# from make_plots import survival_plot
# survival_plot(timings, plot_name=f"survival_plot_{ml_model}")
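The commented-out block above calls make_plots.survival_plot on the per-problem timings; its implementation is not part of this diff. As a rough illustration, a survival plot typically shows, for each method, how many problems finish within a given time budget. A generic sketch under that assumption:

# Generic sketch of a survival plot over per-problem timings; the repository's
# make_plots.survival_plot may differ in details such as styling and scales.
import numpy as np
import matplotlib.pyplot as plt


def survival_plot(timings, plot_name="survival_plot"):
    # timings maps a method name to a list of per-problem solving times.
    for method, times in timings.items():
        sorted_times = np.sort(np.asarray(times, dtype=float))
        problems_solved = np.arange(1, len(sorted_times) + 1)
        plt.step(sorted_times, problems_solved, where="post", label=method)
    plt.xlabel("time budget (seconds)")
    plt.ylabel("problems solved within the budget")
    plt.legend()
    plt.savefig(f"{plot_name}.png")
    plt.close()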
60 changes: 46 additions & 14 deletions replicating_Dorians_features.py
@@ -1,6 +1,10 @@
"""IS THIS BEING USED?"""
"""
IS THIS BEING USED?
YES, IT IS!
"""

import itertools
from xml.sax.handler import all_features
# from xml.sax.handler import all_features
import numpy as np


@@ -9,7 +13,8 @@ def aveg(given_list):


def aveg_not_zero(given_list):
return sum(given_list)/max(1,len([1 for elem in given_list if elem!=0]))
return sum(given_list)/max(1, len([1 for elem in given_list
if elem != 0]))


def identity(input):
@@ -30,34 +35,57 @@ def sign(input):
raise Exception("How is this possible?")


def create_features(degrees, variable=0, sv=False):
functions = [sum, max, aveg, aveg_not_zero]
def create_features(degrees, variable=0, sv=False,
include_aveg_not_zero=False):
if include_aveg_not_zero:
functions = [sum, max, aveg, aveg_not_zero]
else:
functions = [sum, max, aveg] # , aveg_not_zero]
sign_or_not = [identity, sign]
features = []
features_names = []
for choice in itertools.product(functions, sign_or_not, functions, sign_or_not):
feature_description = choice[0].__name__+"sign"*(choice[1].__name__=="sign")+"_in_polys_"+choice[2].__name__+"_"+"sign"*(choice[3].__name__=="sign")+"of_" + "sum_of_"*sv+"degrees_of_var_"+str(variable)+"_in_monomials"
feature_value = choice[0](choice[1]([choice[2](choice[3](degrees_in_poly)) for degrees_in_poly in degrees]))
for choice in itertools.product(functions,
sign_or_not, functions,
sign_or_not):
feature_description = (choice[0].__name__
+ "sign" * (choice[1].__name__ == "sign")
+ "_in_polys_" + choice[2].__name__ + "_"
+ "sign" * (choice[3].__name__ == "sign")
+ "of_" + "sum_of_" * sv + "degrees_of_var_"
+ str(variable) + "_in_monomials")
feature_value = \
choice[0](choice[1]([choice[2](choice[3](degrees_in_poly))
for degrees_in_poly in degrees]))
features.append(feature_value)
features_names.append(feature_description)
return features, features_names


def extract_features(dataset):
my_dataset = dict()
all_features = []
all_targets = []
all_timings = []
all_original_polynomials = []
for index, all_projections in enumerate(dataset[0]):
original_polynomials = all_projections[0][0]
all_projections = []
for index, projections in enumerate(dataset[0]):
all_projections.append(projections)
original_polynomials = projections[0][0]
# the original polynomials are the initial polynomials of any
# of the possible projections (also of the first one)
all_original_polynomials.append(original_polynomials)
all_targets.append(dataset[1][index])
all_timings.append(dataset[2][index])
names, instance_features = features_from_set_of_polys(original_polynomials)
names, instance_features = features_from_set_of_polys(
original_polynomials)
all_features.append(instance_features)
return np.array(all_original_polynomials), np.array(names), np.array(all_features), np.array(all_targets), np.array(all_timings)
my_dataset['polynomials'] = np.array(all_original_polynomials)
my_dataset['names'] = np.array(names)
my_dataset['features'] = np.array(all_features)
my_dataset['targets'] = np.array(all_targets)
my_dataset['timings'] = np.array(all_timings)
my_dataset['projections'] = np.array(all_projections)
return my_dataset


def features_from_set_of_polys(original_polynomials):
@@ -71,8 +99,12 @@ def features_from_set_of_polys(original_polynomials):
variable=var)
instance_features += var_features
names += var_features_names
sdegrees = [[sum(monomial) for monomial in poly if monomial[var]!=0]+[0] for poly in original_polynomials]
svar_features, svar_features_names = create_features(sdegrees, variable=var, sv=True)
sdegrees = \
[[sum(monomial) for monomial in poly if monomial[var] != 0] + [0]
for poly in original_polynomials]
svar_features, svar_features_names = create_features(sdegrees,
variable=var,
sv=True)
instance_features += svar_features
names += svar_features_names
return names, instance_features
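A worked toy example of the sdegrees construction above may help; it assumes each polynomial is stored as a list of monomials and each monomial as a tuple of exponents, one entry per variable (this representation is inferred from the code, not confirmed by the diff).

# Toy example of the sdegrees construction (assumed monomial representation).
original_polynomials = [
    [(2, 1), (0, 3)],   # x0^2*x1 + x1^3
    [(1, 0), (0, 0)],   # x0 + 1
]
var = 0
# Total degree of every monomial that contains variable `var`, with a trailing 0
# so polynomials not containing that variable still contribute an entry.
sdegrees = [[sum(monomial) for monomial in poly if monomial[var] != 0] + [0]
            for poly in original_polynomials]
print(sdegrees)  # [[3, 0], [1, 0]]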
42 changes: 27 additions & 15 deletions test_train_datasets.py
@@ -23,23 +23,29 @@ def count_instances(my_dataset, instance):
def create_train_test_datasets():
clean_dataset_filename = find_dataset_filename('clean')
with open(clean_dataset_filename, 'rb') as clean_dataset_file:
_, names, features, targets, timings = pickle.load(clean_dataset_file)
unique_names, unique_features = remove_notunique_features(names, features)
# features were already unique because of create_clean_dataset
# decide where to remove the features
print("create_train_test", timings)
unique_features_filename = find_other_filename("unique_features")
with open(unique_features_filename, 'wb') as unique_features_file:
pickle.dump(unique_features_filename, unique_features_file)
dataset = pickle.load(clean_dataset_file)

###
# Instead of creating dictionaries for features, labels, ...
# maybe it's better to create a dictionary for each dataset:
# train/test, normal/balanced/augmented
###
x = dict() # to keep the features
y = dict() # to keep the labels
t = dict() # to keep the timings
p = dict() # to keep the projections
# train and test sets are created
random_state = 0
x['train_normal'], x['test_normal'], y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(unique_features, targets, timings,
test_size=0.20,
random_state=random_state)

x['train_normal'], x['test_normal'], \
y['train_normal'], y['test_normal'], \
t['train_normal'], t['test_normal'], \
p['train_normal'], p['test_normal'] = \
train_test_split(dataset['features'],
dataset['targets'],
dataset['timings'],
dataset['projections'],
test_size=0.20,
random_state=random_state)
for purpose in ['train', 'test']:
x[f'{purpose}_balanced'], y[f'{purpose}_balanced'], t[f'{purpose}_balanced'] = balance_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
@@ -51,22 +57,28 @@ def create_train_test_datasets():
for method in ['normal', 'balanced', 'augmented']:
this_dataset_filename = find_dataset_filename(usage, method=method)
with open(this_dataset_filename, 'wb') as this_dataset_file:
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}'], t[f'{usage}_{method}']), this_dataset_file)
if method == 'normal':
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}'], t[f'{usage}_{method}'], p[f'{usage}_{method}']), this_dataset_file)
else:
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}'], t[f'{usage}_{method}']), this_dataset_file)

writer.writerow([f'{usage} {method} dataset']
+ [str(count_instances(y[f'{usage}_{method}'], i))
for i in range(6)]
+ [str(len(y[f'{usage}_{method}']))])


def create_regression_datasets():
def create_regression_datasets(taking_logarithms=True):
for usage in ['train', 'test']:
this_dataset_filename = find_dataset_filename(usage,
method='augmented')
# we will use the augmented dataset here
with open(this_dataset_filename, 'rb') as this_dataset_file:
X, Y, T = pickle.load(this_dataset_file)
Y = [log(timings[0]) for timings in T] # remove log here if real times want to be given
if taking_logarithms:
Y = [log(timings[0]) for timings in T]
else:
Y = [timings[0] for timings in T]
this_dataset_filename =\
find_dataset_filename(usage, method='regression')
with open(this_dataset_filename, 'wb') as this_dataset_file:
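The eight-way unpacking above relies on scikit-learn's train_test_split accepting any number of equal-length arrays and returning a train/test pair for each, in the order given. A standalone toy illustration (data made up):

import numpy as np
from sklearn.model_selection import train_test_split

features = np.arange(20).reshape(10, 2)
targets = np.arange(10)
timings = np.arange(10) * 0.5
projections = np.arange(10) + 100

(x_train, x_test,
 y_train, y_test,
 t_train, t_test,
 p_train, p_test) = train_test_split(features, targets, timings, projections,
                                     test_size=0.20, random_state=0)
print(x_train.shape, x_test.shape)  # (8, 2) (2, 2)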
7 changes: 6 additions & 1 deletion train_models.py
@@ -14,7 +14,12 @@ def train_model(ml_model, method):
train_data_filename = find_dataset_filename('train', method=method)
hyperparams_file = find_hyperparams_filename(method, ml_model)
with open(train_data_filename, 'rb') as train_data_file:
x_train, y_train, _ = pickle.load(train_data_file)
if method == "Normal":
x_train, y_train, _, _ = pickle.load(train_data_file)
else:
x_train, y_train, _ = pickle.load(train_data_file)
# a = pickle.load(train_data_file)
# print(a[0], type(a), len(a), method)
hyperparams = read_yaml_from_file(hyperparams_file)
current_classifier = sklearn_models[ml_model]
clf = current_classifier(**hyperparams)
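train_model loads tuned hyperparameters and instantiates the chosen scikit-learn class with them. Since read_yaml_from_file and sklearn_models are repository helpers not shown here, the sketch below uses plain PyYAML and an explicit model table instead; the file path and its contents are made up.

import yaml
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Stand-in for the repository's sklearn_models mapping (assumed shape).
sklearn_models = {"RF": RandomForestClassifier, "SVC": SVC}

with open("config/hyperparams/RF_normal.yaml") as f:   # hypothetical path
    hyperparams = yaml.safe_load(f) or {}               # e.g. {"n_estimators": 200}

clf = sklearn_models["RF"](**hyperparams)
# clf.fit(x_train, y_train) would follow, as in train_model above.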
