Now heuristics and regression also available
Tereso del Rio committed Sep 14, 2023
1 parent 2c5d807 commit 94aaa66
Showing 3 changed files with 203 additions and 0 deletions.
87 changes: 87 additions & 0 deletions main_heuristics.py
@@ -0,0 +1,87 @@
import csv
import math
import pickle
import random
import numpy as np
from Heuristics.heuristics_guess import not_greedy_heuristic_guess
from Heuristics.heuristics_guess import choose_order_given_projections
from find_filename import find_dataset_filename
from test_models import compute_metrics

nvar = 3
testing_method = 'Normal'
test_dataset_filename = find_dataset_filename('Test',
                                              testing_method)
with open(test_dataset_filename, 'rb') as test_dataset_file:
    testing_dataset = pickle.load(test_dataset_file)
output_file = "heuristics_output_acc_time.csv"

# Testing heuristics that make the whole choice at once
first_heuristic = 1
for heuristic in ['gmods', 'brown', 'random', 'virtual best']:
    reps = 100
    sum_metrics = dict()
    for i in range(reps):
        if heuristic == 'virtual best':
            chosen_indices = [np.argmin(timings)
                              for timings in testing_dataset['timings']]
        elif heuristic == 'random':
            # nvar = 3, so there are 3! = 6 possible orderings (indices 0..5)
            chosen_indices = [random.randint(0, 5)
                              for timings in testing_dataset['timings']]
        else:
            chosen_indices = [not_greedy_heuristic_guess(projection[0][0], heuristic)
                              for projection in testing_dataset['projections']]
        metrics = compute_metrics(chosen_indices,
                                  testing_dataset['labels'],
                                  testing_dataset['timings'],
                                  testing_dataset['cells'])
        if len(sum_metrics) == 0:
            sum_metrics = metrics
        else:
            sum_metrics = {key: metrics[key] + sum_metrics[key]
                           for key in metrics}
    # Average over the repetitions; rescale everything except 'Accuracy'
    # and 'Markup' by the number of possible orderings (nvar!)
    aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics}
    augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup']
                         else math.factorial(nvar)*aveg_metrics[key]
                         for key in sum_metrics}

    print(heuristic, augmented_metrics)
    if first_heuristic == 1:
        first_heuristic = 0
        keys = list(augmented_metrics.keys())
        with open(output_file, 'a') as f:
            f.write('Choosing the whole ordering at the beginning\n')
            f.write(', '.join(['Model'] + keys) + '\n')
    with open(output_file, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([heuristic] + [augmented_metrics[key] for key in keys])

# Testing greedy heuristics
first_heuristic = 1  # reset so this section also gets its header row
for heuristic in ['brown', 'gmods', 'random', 'virtual best']:
    reps = 100
    sum_metrics = dict()
    for i in range(reps):
        if heuristic == 'virtual best':
            chosen_indices = [np.argmin(timings)
                              for timings in testing_dataset['timings']]
        elif heuristic == 'random':
            chosen_indices = [random.randint(0, 5)
                              for timings in testing_dataset['timings']]
        else:
            chosen_indices = [choose_order_given_projections(projection, heuristic)
                              for projection in testing_dataset['projections']]
        metrics = compute_metrics(chosen_indices,
                                  testing_dataset['labels'],
                                  testing_dataset['timings'],
                                  testing_dataset['cells'])
        if len(sum_metrics) == 0:
            sum_metrics = metrics
        else:
            sum_metrics = {key: metrics[key] + sum_metrics[key]
                           for key in metrics}
    aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics}
    augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup']
                         else math.factorial(nvar)*aveg_metrics[key]
                         for key in sum_metrics}

    print(heuristic, augmented_metrics)
    if first_heuristic == 1:
        first_heuristic = 0
        keys = list(augmented_metrics.keys())
        with open(output_file, 'a') as f:
            f.write('Now choosing greedily\n')
            f.write(', '.join(['Model'] + keys) + '\n')
    with open(output_file, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([heuristic] + [augmented_metrics[key] for key in keys])
# print(sum(min(timings) for timings in testing_dataset['timings']))
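
The random baseline above draws indices in [0, 5] because with nvar = 3
variables there are 3! = 6 possible orderings. A minimal, self-contained
sketch of one index-to-ordering correspondence (the mapping actually used by
Heuristics.heuristics_guess may differ):

import math
from itertools import permutations

orderings = list(permutations(['x1', 'x2', 'x3']))
assert len(orderings) == math.factorial(3)  # 6 orderings, indices 0..5
print(orderings[5])  # ('x3', 'x2', 'x1') under lexicographic enumeration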
68 changes: 68 additions & 0 deletions main_regression.py
@@ -0,0 +1,68 @@
"""
The experiments in [1] are replicated with some changes.

The first change is that the testing data is balanced, so that all labels
are almost equally common.
We then use three training sets: the dataset as in [1], a balanced dataset,
and a data-augmentation dataset.

[1] Florescu, D., England, M. (2020). A Machine Learning Based Software Pipeline
to Pick the Variable Ordering for Algorithms with Polynomial Inputs.
Bigatti, A., Carette, J., Davenport, J., Joswig, M., de Wolff, T. (eds)
Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science,
vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30
"""
import csv
from config.ml_models import ml_regressors
from create_clean_dataset import cleaning_dataset
from test_train_datasets import create_train_test_datasets
from test_train_datasets import create_regression_datasets
from choose_hyperparams import choose_hyperparams
from train_models import train_model
# from test_models import test_regressor
from test_models import test_model


# Hyperparameter tuning takes a very long time, so tune_hyperparameters
# decides whether to tune them anew or to use previously tuned values.
tune_hyperparameters = False
taking_logarithms = False


# cleaning_dataset()
# create_train_test_datasets()
create_regression_datasets(taking_logarithms=taking_logarithms)

paradigm = "regression"
# if tune_hyperparameters:
#     for ml_model in ml_regressors:
#         print(f"Choosing hyperparameters for {ml_model} in {paradigm}")
#         choose_hyperparams(ml_model, paradigm)
for ml_model in ml_regressors:
    print(f"Training {ml_model}")
    print(f"for {paradigm}")
    train_model(ml_model, paradigm)
testing_method = 'augmented'
output_file = "regression_output_acc_time.csv"
# with open(output_file, 'a') as f:
# f.write("Now without logarithms and without aveg_not_zero\n")

first_time = 1
for ml_model in ml_regressors:
    # For KNNR to run properly, X.shape[0] has been changed to len(X)
    # on line 240 of
    # C:\Software\Python37\Lib\site-packages\sklearn\neighbors\_regression.py
    print(f"Testing models trained with {ml_model}")
    metrics = test_model(ml_model, paradigm=paradigm,
                         testing_method=testing_method)
    if first_time == 1:
        first_time = 0
        keys = list(metrics.keys())
        with open(output_file, 'a') as f:
            f.write('No more cheating; also not taking logarithms\n')
            f.write(', '.join(['Model'] + keys) + '\n')
    with open(output_file, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([ml_model] + [metrics[key] for key in keys])
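
Since taking_logarithms toggles a log transform during dataset creation, here
is a minimal sketch of the usual pattern it refers to (an assumption: the toy
data, the choice of regressor, and the idea that timings are the regression
targets are all illustrative, not the repository's API):

import numpy as np
from sklearn.linear_model import LinearRegression

X_train = np.array([[1.0], [2.0], [3.0]])   # toy features
y_train = np.array([0.1, 1.0, 30.0])        # toy timings in seconds
model = LinearRegression()
model.fit(X_train, np.log1p(y_train))       # fit on log(1 + t) to compress the heavy tail
y_pred = np.expm1(model.predict(X_train))   # map predictions back to seconds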
48 changes: 48 additions & 0 deletions make_plots.py
@@ -0,0 +1,48 @@
"""Make some plots"""
import os
import numpy as np
from numpy import sort
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
matplotlib.rcParams['mathtext.rm'] = 'Bitstream Vera Sans'
matplotlib.rcParams['mathtext.it'] = 'Bitstream Vera Sans:italic'
matplotlib.rcParams['mathtext.bf'] = 'Bitstream Vera Sans:bold'
matplotlib.rcParams['mathtext.fontset'] = 'cm'
matplotlib.rcParams['font.family'] = 'STIXGeneral'

fontsize = 15
desired_font = {'fontname': 'monospace'}
matplotlib.rcParams.update({'font.size': fontsize})


def survival_plot(timings: dict, plot_name="survival_plot"):
    """Receive a dictionary mapping the name of each method
    to the list of timings it took on each of the problems."""
    color = cm.rainbow(np.linspace(0, 1, len(timings)+1))
    # color[4]=[0.8,0.8,0.2,1]
    # color[3]=[0.65,0.42,0.42,1]
    # color[2]=[0.00,1,0.5,1]
    # color = ['0','0.5','0','0.5','0','0.5']
    style = ['--'] * len(timings)
    dashes = [(1, 0), (5, 1), (5, 1, 1, 1), (2, 1, 2, 1), (1, 1), (5, 5)]\
        + [(1, 0)] * len(timings)

    for method, c, s, d in zip(timings, color, style, dashes):
        # Discard problems that hit the 30s or 60s timeout
        not_timeout_timings = [timing for timing in timings[method]
                               if timing != 30 and timing != 60]
        sorted_timings = sort(not_timeout_timings)
        # Cumulative time needed to finish the i fastest problems
        accumulative_timings = [sum(sorted_timings[:i])
                                for i in range(len(sorted_timings))]
        plt.plot(accumulative_timings, list(range(len(accumulative_timings))),
                 s, color=c, label=method, dashes=d)
    plt.xlabel('Time', fontsize=fontsize)
    plt.ylabel('No. problems finished', fontsize=fontsize)
    plt.legend(prop={'family': 'monospace', 'size': fontsize-2},
               loc='lower right')
    figure_location = os.path.join(os.path.dirname(__file__), 'Art',
                                   f'{plot_name}.png')
    plt.savefig(figure_location)
    plt.cla()
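
For instance, a hypothetical call (method names and timings invented for
illustration; 30 and 60 are the timeout values the function filters out):

survival_plot({'gmods': [0.5, 2.1, 3.4, 30],
               'brown': [0.7, 1.8, 3.2, 60],
               'random': [1.5, 4.0, 30, 30]},
              plot_name='heuristics_survival')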
