diff --git a/main_heuristics.py b/main_heuristics.py
new file mode 100644
index 0000000..339aa9f
--- /dev/null
+++ b/main_heuristics.py
@@ -0,0 +1,87 @@
+import csv
+import math
+import pickle
+import random
+import numpy as np
+from Heuristics.heuristics_guess import not_greedy_heuristic_guess
+from Heuristics.heuristics_guess import choose_order_given_projections
+from find_filename import find_dataset_filename
+from test_models import compute_metrics
+
+nvar = 3
+testing_method = 'Normal'
+test_dataset_filename = find_dataset_filename('Test', testing_method)
+with open(test_dataset_filename, 'rb') as test_dataset_file:
+    testing_dataset = pickle.load(test_dataset_file)
+output_file = "heuristics_output_acc_time.csv"
+
+# Testing heuristics that make the whole choice at once
+first_heuristic = 1
+for heuristic in ['gmods', 'brown', 'random', 'virtual best']:
+    reps = 100
+    sum_metrics = dict()
+    for i in range(reps):
+        if heuristic == 'virtual best':
+            chosen_indices = [np.argmin(timings)
+                              for timings in testing_dataset['timings']]
+        elif heuristic == 'random':
+            # nvar = 3 variables give 3! = 6 orderings, so indices run 0..5
+            chosen_indices = [random.randint(0, math.factorial(nvar) - 1)
+                              for timings in testing_dataset['timings']]
+        else:
+            chosen_indices = [not_greedy_heuristic_guess(projection[0][0],
+                                                         heuristic)
+                              for projection in testing_dataset['projections']]
+        metrics = compute_metrics(chosen_indices,
+                                  testing_dataset['labels'],
+                                  testing_dataset['timings'],
+                                  testing_dataset['cells'])
+        if len(sum_metrics) == 0:
+            sum_metrics = metrics
+        else:
+            sum_metrics = {key: metrics[key] + sum_metrics[key]
+                           for key in metrics}
+    aveg_metrics = {key: sum_metrics[key] / reps for key in sum_metrics}
+    # 'Accuracy' and 'Markup' are already normalised; the remaining
+    # metrics are scaled by the number of possible orderings (nvar!)
+    augmented_metrics = {key: aveg_metrics[key]
+                         if key in ['Accuracy', 'Markup']
+                         else math.factorial(nvar) * aveg_metrics[key]
+                         for key in sum_metrics}
+
+    print(heuristic, augmented_metrics)
+    if first_heuristic == 1:
+        first_heuristic = 0
+        keys = list(augmented_metrics.keys())
+        with open(output_file, 'a') as f:
+            f.write('Choosing the whole ordering at the beginning\n')
+            f.write(', '.join(['Model'] + keys) + '\n')
+    with open(output_file, 'a', newline='') as f:
+        writer = csv.writer(f)
+        writer.writerow([heuristic] + [augmented_metrics[key] for key in keys])
+
+# Testing greedy heuristics
+first_heuristic = 1  # reset, otherwise the greedy header below is never written
+for heuristic in ['brown', 'gmods', 'random', 'virtual best']:
+    reps = 100
+    sum_metrics = dict()
+    for i in range(reps):
+        if heuristic == 'virtual best':
+            chosen_indices = [np.argmin(timings)
+                              for timings in testing_dataset['timings']]
+        elif heuristic == 'random':
+            chosen_indices = [random.randint(0, math.factorial(nvar) - 1)
+                              for timings in testing_dataset['timings']]
+        else:
+            chosen_indices = [choose_order_given_projections(projection,
+                                                             heuristic)
+                              for projection in testing_dataset['projections']]
+        metrics = compute_metrics(chosen_indices,
+                                  testing_dataset['labels'],
+                                  testing_dataset['timings'],
+                                  testing_dataset['cells'])
+        if len(sum_metrics) == 0:
+            sum_metrics = metrics
+        else:
+            sum_metrics = {key: metrics[key] + sum_metrics[key]
+                           for key in metrics}
+    aveg_metrics = {key: sum_metrics[key] / reps for key in sum_metrics}
+    augmented_metrics = {key: aveg_metrics[key]
+                         if key in ['Accuracy', 'Markup']
+                         else math.factorial(nvar) * aveg_metrics[key]
+                         for key in sum_metrics}
+
+    print(heuristic, augmented_metrics)
+    if first_heuristic == 1:
+        first_heuristic = 0
+        keys = list(augmented_metrics.keys())
+        with open(output_file, 'a') as f:
+            f.write('Now choosing greedily\n')
+            f.write(', '.join(['Model'] + keys) + '\n')
+    with open(output_file, 'a', newline='') as f:
+        writer = csv.writer(f)
+        writer.writerow([heuristic] + [augmented_metrics[key] for key in keys])
+# print(sum(min(timings) for timings in testing_dataset['timings']))
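Note on the index convention above: with nvar = 3 there are 3! = 6 possible variable orderings, which is why the random baseline draws an index in 0..5. A minimal sketch of one way to map such an index back to a concrete ordering, assuming a lexicographic enumeration (the variable names and the enumeration order are illustrative, not the project's actual encoding):

    import itertools

    variables = ['x1', 'x2', 'x3']  # hypothetical variable names
    # enumerate all 3! = 6 orderings in lexicographic order
    orderings = list(itertools.permutations(variables))

    def ordering_from_index(index):
        """Map a chosen index (0..5) back to a variable ordering."""
        return orderings[index]

    print(ordering_from_index(0))  # ('x1', 'x2', 'x3')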
diff --git a/main_regression.py b/main_regression.py
new file mode 100644
index 0000000..fa30ab4
--- /dev/null
+++ b/main_regression.py
@@ -0,0 +1,68 @@
+"""
+The experiments in [1] are replicated with some changes.
+
+The first change is that the testing data is balanced, so that all labels
+are almost equally common.
+We then use three training sets: the dataset as in [1], a balanced dataset
+and a data-augmented dataset.
+
+[1] Florescu, D., England, M. (2020). A Machine Learning Based Software
+Pipeline to Pick the Variable Ordering for Algorithms with Polynomial Inputs.
+Bigatti, A., Carette, J., Davenport, J., Joswig, M., de Wolff, T. (eds)
+Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer
+Science, vol 12097. Springer, Cham.
+https://doi.org/10.1007/978-3-030-52200-1_30
+"""
+import csv
+from config.ml_models import ml_regressors
+from create_clean_dataset import cleaning_dataset
+from test_train_datasets import create_train_test_datasets
+from test_train_datasets import create_regression_datasets
+from choose_hyperparams import choose_hyperparams
+from train_models import train_model
+# from test_models import test_regressor
+from test_models import test_model
+
+
+# Hyperparameter tuning takes a very long time;
+# tune_hyperparameters decides whether to tune now
+# or to reuse previously tuned hyperparameters.
+tune_hyperparameters = False
+taking_logarithms = False
+
+
+# cleaning_dataset()
+# create_train_test_datasets()
+create_regression_datasets(taking_logarithms=taking_logarithms)
+
+paradigm = "regression"
+if tune_hyperparameters:
+    for ml_model in ml_regressors:
+        print(f"Choosing hyperparameters for {ml_model} in {paradigm}")
+        choose_hyperparams(ml_model, paradigm)
+for ml_model in ml_regressors:
+    print(f"Training {ml_model} for {paradigm}")
+    train_model(ml_model, paradigm)
+testing_method = 'augmented'
+output_file = "regression_output_acc_time.csv"
+# with open(output_file, 'a') as f:
+#     f.write("Now without logarithms and without aveg_not_zero\n")
+
+first_time = 1
+for ml_model in ml_regressors:
+    # For KNNR to run properly, X.shape[0] has been changed to len(X)
+    # in line 240 of
+    # C:\Software\Python37\Lib\site-packages\sklearn\neighbors\_regression.py
+    print(f"Testing models trained in {ml_model}")
+    metrics = test_model(ml_model, paradigm=paradigm,
+                         testing_method=testing_method)
+    if first_time == 1:
+        first_time = 0
+        keys = list(metrics.keys())
+        with open(output_file, 'a') as f:
+            f.write('No more cheating; no logarithms taken either\n')
+            f.write(', '.join(['Model'] + keys) + '\n')
+    with open(output_file, 'a', newline='') as f:
+        writer = csv.writer(f)
+        writer.writerow([ml_model] + [metrics[key] for key in keys])
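The KNNR comment above records an in-place edit of the installed scikit-learn sources, which is lost on every upgrade of the package. A less invasive workaround, assuming the failure comes from passing a plain Python list where an array with a .shape attribute is expected, is to convert the features with np.asarray before calling the regressor; a minimal sketch with toy data:

    import numpy as np
    from sklearn.neighbors import KNeighborsRegressor

    # toy data; the real features come from the project's datasets
    X_train = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
    y_train = np.array([0.5, 1.5, 2.0])

    knnr = KNeighborsRegressor(n_neighbors=2)
    knnr.fit(X_train, y_train)

    # np.asarray is a no-op on arrays and converts lists,
    # so sklearn's internals can rely on X.shape
    X_test = [[0.5, 0.5]]
    print(knnr.predict(np.asarray(X_test)))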
diff --git a/make_plots.py b/make_plots.py
new file mode 100644
index 0000000..7967127
--- /dev/null
+++ b/make_plots.py
@@ -0,0 +1,48 @@
+"""Make some plots."""
+import os
+import numpy as np
+from numpy import sort
+import matplotlib
+import matplotlib.pyplot as plt
+from matplotlib.pyplot import cm
+matplotlib.rcParams['mathtext.rm'] = 'Bitstream Vera Sans'
+matplotlib.rcParams['mathtext.it'] = 'Bitstream Vera Sans:italic'
+matplotlib.rcParams['mathtext.bf'] = 'Bitstream Vera Sans:bold'
+matplotlib.rcParams['mathtext.fontset'] = 'cm'
+matplotlib.rcParams['font.family'] = 'STIXGeneral'
+
+fontsize = 15
+desired_font = {'fontname': 'monospace'}
+matplotlib.rcParams.update({'font.size': fontsize})
+
+
+def survival_plot(timings: dict, plot_name="survival_plot"):
+    """Receive a dictionary whose keys are the names of the methods
+    and whose values are the timings each method took on the problems."""
+    color = cm.rainbow(np.linspace(0, 1, len(timings) + 1))
+    # color[4] = [0.8, 0.8, 0.2, 1]
+    # color[3] = [0.65, 0.42, 0.42, 1]
+    # color[2] = [0.00, 1, 0.5, 1]
+    # color = ['0', '0.5', '0', '0.5', '0', '0.5']
+    style = ['--'] * len(timings)
+    dashes = [(1, 0), (5, 1), (5, 1, 1, 1), (2, 1, 2, 1), (1, 1), (5, 5)]\
+        + [(1, 0)] * len(timings)
+
+    for method, c, s, d in zip(timings, color, style, dashes):
+        # discard runs that hit the timeout (30 or 60 seconds)
+        not_timeout_timings = [timing for timing in timings[method]
+                               if timing != 30 and timing != 60]
+        sorted_timings = sort(not_timeout_timings)
+        # cumulative time needed to finish the i fastest problems
+        accumulative_timings = [sum(sorted_timings[:i])
+                                for i in range(len(sorted_timings))]
+        # plotting
+        plt.plot(accumulative_timings,
+                 list(range(len(accumulative_timings))),
+                 s, color=c, label=method, dashes=d)
+    plt.xlabel('Time', fontsize=fontsize)
+    plt.ylabel('No. problems finished', fontsize=fontsize)
+    plt.legend(prop={'family': 'monospace', 'size': fontsize - 2},
+               loc='lower right')
+    figure_location = os.path.join(os.path.dirname(__file__), 'Art',
+                                   f'{plot_name}.png')
+    plt.savefig(figure_location)
+    plt.cla()
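A minimal usage sketch for survival_plot with made-up data (the method names and timings are invented; 30 stands in for a timed-out run, and an Art/ directory is assumed to exist next to make_plots.py, since the function saves the figure there):

    import random

    random.seed(0)
    example_timings = {
        'gmods': [random.uniform(0, 10) for _ in range(50)],
        'brown': [random.uniform(0, 12) for _ in range(50)],
        'random': [random.choice([random.uniform(0, 15), 30])
                   for _ in range(50)],
    }
    survival_plot(example_timings, plot_name="example_survival_plot")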