diff --git a/config/general_values.py b/config/general_values.py
index 46b5805..f9a5d8d 100644
--- a/config/general_values.py
+++ b/config/general_values.py
@@ -1,3 +1,15 @@
 purposes = ['Train', 'Test']
 dataset_qualities = ['Normal', 'Balanced', 'Augmented']
+
+
+def aveg(given_list):
+    return sum(given_list)/len(given_list)
+
+
+def aveg_not_zero(given_list):
+    return sum(given_list)/max(1, len([1 for elem in given_list
+                                       if elem != 0]))
+
+
+operations = [sum, max, aveg]  # , aveg_not_zero
diff --git a/config/hyperparameters_grid.py b/config/hyperparameters_grid.py
index af84889..cc0182a 100644
--- a/config/hyperparameters_grid.py
+++ b/config/hyperparameters_grid.py
@@ -2,16 +2,10 @@
 grid = dict()
 
 grid['RF'] = {
-    # 'n_estimators': [200, 300, 400, 500],
-    # 'max_features': ['sqrt', 'log2'],
-    # 'max_depth': [4, 5, 6, 7, 8],
-    # 'criterion': ['gini', 'entropy']
-    'n_estimators': [50, 100, 200],
-    'criterion': ['gini', 'entropy'],
-    'max_depth': [None, 10, 20, 30],
-    'min_samples_split': [2, 5, 10],
-    'min_samples_leaf': [1, 2, 4],
-    'class_weight': [None, 'balanced'],
+    'n_estimators': [200, 300, 400, 500],
+    'max_features': ['sqrt', 'log2'],
+    'max_depth': [4, 5, 6, 7, 8],
+    'criterion': ['gini', 'entropy']
 }
 grid['KNN'] = {
     'n_neighbors': [1,3,5,7,12],
@@ -41,18 +35,9 @@
 }
 grid['RFR'] = {
-    # 'n_estimators': [200, 300, 400, 500],
-    # 'max_features': ['sqrt', 'log2'],
-    # 'max_depth': [4, 5, 6, 7, 8],
-    # 'criterion': ['squared_error', 'entropy']
-    # # 'criterion': ['squared_error', 'friedman_mse'],
-    # # "max_depth": [1,3,7],
-    # # "min_samples_leaf": [1,5,10],
-    'n_estimators': [50, 100, 200],
-    'criterion': ['mse', 'mae'],
-    'max_depth': [None, 10, 20, 30],
-    'min_samples_split': [2, 5, 10],
-    'min_samples_leaf': [1, 2, 4],
+    'criterion': ['squared_error', 'friedman_mse'],
+    "max_depth": [1,3,7],
+    "min_samples_leaf": [1,5,10],
 }
 grid['KNNR'] = {
     'n_neighbors': [3, 5, 10],
@@ -82,4 +67,4 @@
 grid['SGD'] = {
     'loss':["squared_error", "huber", "epsilon_insensitive"],
     'penalty':["l2", "l1", "elasticnet"]
-}
\ No newline at end of file
+}
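
For context on how these grids are consumed: choose_hyperparams (called from main.py further down, but not itself part of this diff) presumably runs a grid search over grid[<model>]. A minimal sketch under that assumption — the estimator, fold count, and function name below are illustrative, not the project's actual implementation:

    # Sketch only: how grid['RF'] above would feed a scikit-learn grid
    # search, assuming choose_hyperparams wraps GridSearchCV (not shown
    # in this diff).
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV
    from config.hyperparameters_grid import grid

    def sketch_choose_hyperparams(x_train, y_train):
        search = GridSearchCV(RandomForestClassifier(),
                              param_grid=grid['RF'],
                              cv=5)  # illustrative fold count
        search.fit(x_train, y_train)
        return search.best_params_  # e.g. {'criterion': 'gini', ...}
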
diff --git a/config/hyperparams/augmented_DT.yaml b/config/hyperparams/augmented_DT.yaml
index a9e3b69..5e7ea5d 100644
--- a/config/hyperparams/augmented_DT.yaml
+++ b/config/hyperparams/augmented_DT.yaml
@@ -1,3 +1,3 @@
 criterion: gini
-max_depth: 16
-splitter: best
+max_depth: 19
+splitter: random
diff --git a/config/hyperparams/augmented_KNN.yaml b/config/hyperparams/augmented_KNN.yaml
index 80fcee0..ea5b9b4 100644
--- a/config/hyperparams/augmented_KNN.yaml
+++ b/config/hyperparams/augmented_KNN.yaml
@@ -1,3 +1,3 @@
-algorithm: kd_tree
+algorithm: auto
 n_neighbors: 12
 weights: distance
diff --git a/config/hyperparams/balanced_DT.yaml b/config/hyperparams/balanced_DT.yaml
index da4ceb5..82d03ca 100644
--- a/config/hyperparams/balanced_DT.yaml
+++ b/config/hyperparams/balanced_DT.yaml
@@ -1,3 +1,3 @@
 criterion: gini
-max_depth: 7
+max_depth: 4
 splitter: best
diff --git a/config/hyperparams/balanced_KNN.yaml b/config/hyperparams/balanced_KNN.yaml
index 710b5f6..6b4c149 100644
--- a/config/hyperparams/balanced_KNN.yaml
+++ b/config/hyperparams/balanced_KNN.yaml
@@ -1,3 +1,3 @@
-algorithm: auto
+algorithm: ball_tree
 n_neighbors: 1
 weights: uniform
diff --git a/config/hyperparams/normal_DT.yaml b/config/hyperparams/normal_DT.yaml
index 5ed1b07..bfd1d81 100644
--- a/config/hyperparams/normal_DT.yaml
+++ b/config/hyperparams/normal_DT.yaml
@@ -1,3 +1,3 @@
 criterion: gini
-max_depth: 13
+max_depth: 10
 splitter: random
diff --git a/config/hyperparams/normal_KNN.yaml b/config/hyperparams/normal_KNN.yaml
index d7863e4..b1680ba 100644
--- a/config/hyperparams/normal_KNN.yaml
+++ b/config/hyperparams/normal_KNN.yaml
@@ -1,3 +1,3 @@
 algorithm: auto
-n_neighbors: 7
+n_neighbors: 5
 weights: distance
diff --git a/config/ml_models.py b/config/ml_models.py
index 3cdc1b8..8db0663 100644
--- a/config/ml_models.py
+++ b/config/ml_models.py
@@ -13,19 +13,19 @@
 from sklearn.neighbors import KNeighborsRegressor
 
 ml_models = [
-    # 'KNN',
-    # 'DT',
-    # 'SVC',
+    'KNN',
+    'DT',
+    'SVC',
     'RF',
-    # 'MLP'
+    'MLP'
 ]
 
 ml_regressors = [
-    # 'DTR',
-    # 'SVR',
+    'DTR',
+    'SVR',
     'RFR',
-    # 'KNNR',
-    # 'MLPR'
+    'KNNR',
+    'MLPR'
 ]
 
 sklearn_models = {
diff --git a/create_clean_dataset.py b/create_clean_dataset.py
index 97c3114..ec56e2f 100644
--- a/create_clean_dataset.py
+++ b/create_clean_dataset.py
@@ -2,6 +2,7 @@
 the sets of polynomials and its timings for each order,
 creates a dataset containing a set of unique features and its class"""
 
+import re
 import pickle
 import numpy as np
 from replicating_Dorians_features import extract_features
@@ -52,6 +53,7 @@ def cleaning_dataset():
     clean_dataset['names'], clean_dataset['features'] = \
         remove_notunique_features(my_dataset['names'], my_dataset['features'])
+    print("features in normal", len(my_dataset['features'][0]))
     unique_features_filename = find_other_filename("unique_features")
     with open(unique_features_filename, 'wb') as unique_features_file:
         pickle.dump(clean_dataset['names'], unique_features_file)
@@ -60,10 +62,13 @@
         np.array([[convert_to_timing(timings_ordering)
                    for timings_ordering in timings_problem]
                   for timings_problem in my_dataset['timings']])
+    # Some cell counts are reported as "Over 30"; those are penalized here
+    clean_dataset['cells'] = \
+        np.array([convert_to_cells(cells_problem)
+                  for cells_problem in my_dataset['cells']])
     for key in my_dataset:
         if key not in clean_dataset:
             clean_dataset[key] = my_dataset[key]
-    print("CLEAN", clean_dataset.keys())
     with open(clean_dataset_filename, 'wb') as clean_dataset_file:
         pickle.dump(clean_dataset, clean_dataset_file)
@@ -76,9 +81,32 @@
 # cleaning_dataset(dataset_filename, clean_dataset_filename)
 
 
-def convert_to_timing(timing_str):
-    if timing_str == "Over 30":
-        return 60
-    if timing_str == "Over 60":
-        return 120
+def convert_to_timing(timing_str, penalization=2):
+    # "Over 30"/"Over 60" are timeout markers: return the limit
+    # multiplied by the penalization factor
+    if not contains_float(timing_str):
+        print(penalization * float(timing_str[5:]))
+        return penalization * float(timing_str[5:])
     return float(timing_str)
+
+
+def convert_to_cells(cells, penalization=2):
+    int_cells = [int(cell) if contains_int(cell) else cell
+                 for cell in cells]
+    max_cells = max([cell for cell in int_cells if isinstance(cell, int)])
+    penalization_cells = [cell if isinstance(cell, int)
+                          else penalization*max_cells
+                          for cell in int_cells]
+    return penalization_cells
+
+
+def contains_float(input_str):
+    float_pattern = r'^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$'
+    match = re.search(float_pattern, input_str)
+    return match is not None
+
+
+def contains_int(input_str):
+    int_pattern = r'^[-+]?\d+$'
+    match = re.match(int_pattern, input_str)
+    return match is not None
+
+cleaning_dataset()
\ No newline at end of file
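
The penalization helpers above behave as follows (a minimal illustration with the definitions above in scope; the input values are made up, and note that importing create_clean_dataset.py now also triggers the module-level cleaning_dataset() call):

    # "Over 30"[5:] is "30", so timeouts become penalization * 30 = 60.0
    # and penalization * 60 = 120.0 with the default penalization=2.
    convert_to_timing("12.5")     # -> 12.5 (plain float, returned unchanged)
    convert_to_timing("Over 30")  # -> 60.0
    convert_to_timing("Over 60")  # -> 120.0

    # Non-numeric cell counts are replaced by twice the largest completed
    # count for the same problem.
    convert_to_cells(["7", "11", "Over 30"])  # -> [7, 11, 22]
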
diff --git a/datasets/clean_dataset.txt b/datasets/clean_dataset.txt
index f9786e1..2a4b33d 100644
Binary files a/datasets/clean_dataset.txt and b/datasets/clean_dataset.txt differ
diff --git a/datasets/test/augmented_test_dataset.txt b/datasets/test/augmented_test_dataset.txt
index a600bea..48558c8 100644
Binary files a/datasets/test/augmented_test_dataset.txt and b/datasets/test/augmented_test_dataset.txt differ
diff --git a/datasets/test/balanced_test_dataset.txt b/datasets/test/balanced_test_dataset.txt
index 4c7cf43..aacd1c6 100644
Binary files a/datasets/test/balanced_test_dataset.txt and b/datasets/test/balanced_test_dataset.txt differ
diff --git a/datasets/test/normal_test_dataset.txt b/datasets/test/normal_test_dataset.txt
index 9c61a27..092e878 100644
Binary files a/datasets/test/normal_test_dataset.txt and b/datasets/test/normal_test_dataset.txt differ
diff --git a/datasets/train/augmented_train_dataset.txt b/datasets/train/augmented_train_dataset.txt
index b97c71d..1092295 100644
Binary files a/datasets/train/augmented_train_dataset.txt and b/datasets/train/augmented_train_dataset.txt differ
diff --git a/datasets/train/balanced_train_dataset.txt b/datasets/train/balanced_train_dataset.txt
index bc476ed..4cd23cf 100644
Binary files a/datasets/train/balanced_train_dataset.txt and b/datasets/train/balanced_train_dataset.txt differ
diff --git a/datasets/train/normal_train_dataset.txt b/datasets/train/normal_train_dataset.txt
index 795a448..3034b94 100644
Binary files a/datasets/train/normal_train_dataset.txt and b/datasets/train/normal_train_dataset.txt differ
diff --git a/find_filename.py b/find_filename.py
index 2e02e8e..feb911e 100644
--- a/find_filename.py
+++ b/find_filename.py
@@ -1,5 +1,5 @@
 import os
-from config.general_values import dataset_qualities
+# from config.general_values import dataset_qualities
 from config.general_values import purposes
 
@@ -49,8 +49,6 @@ def find_other_filename(search):
                         f'{search}.txt')
 
 
-import pickle
-names_filename = find_other_filename('unique_names')
-with open(names_filename, 'rb') as names_f:
-    names = pickle.load(names_f)
-print(len(names), '\n', names[2], '\n', names[67], '\n', names[132])
+def find_timings_lists(model):
+    return os.path.join(os.path.dirname(__file__), 'results',
+                        'timings_lists', f'{model}.txt')
diff --git a/main.py b/main.py
index 51382b1..f5c052d 100644
--- a/main.py
+++ b/main.py
@@ -31,6 +31,7 @@
 # tune_hyperparameters is used to decide whether to tune them
 # or to use previously tuned ones
 tune_hyperparameters = False
+train_the_models = True
 paradigm = 'classification'
 
 cleaning_dataset()
@@ -41,11 +42,12 @@
     for method in dataset_qualities:
         print(f"Choosing hyperparameters for {ml_model} in {method}")
         choose_hyperparams(ml_model, method)
-# for ml_model in ml_models:
-#     print(f"Training {ml_model}")
-#     for method in dataset_qualities:
-#         print(f"for {method}")
-#         train_model(ml_model, method)
+if train_the_models:
+    for ml_model in ml_models:
+        print(f"Training {ml_model}")
+        for method in dataset_qualities:
+            print(f"for {method}")
+            train_model(ml_model, method)
 training_method = 'augmented'
 testing_method = 'augmented'
 first_time = 1
diff --git a/main_heuristics.py b/main_heuristics.py
index fc5730a..2853150 100644
--- a/main_heuristics.py
+++ b/main_heuristics.py
@@ -3,13 +3,15 @@
 import pickle
 import random
 # import numpy as np
-# from Heuristics.heuristics_guess import not_greedy_heuristic_guess
-# from Heuristics.heuristics_guess import choose_order_given_projections
+from Heuristics.heuristics_guess import not_greedy_heuristic_guess
+from Heuristics.heuristics_guess import choose_order_given_projections
 from find_filename import find_dataset_filename
 from test_models import compute_metrics
 
+random.seed(0)
+
 nvar = 3
-testing_method = 'Augmented'
+testing_method = 'Normal'
 test_dataset_filename = \
     find_dataset_filename('Test', testing_method)
 with open(test_dataset_filename, 'rb') as test_dataset_file:
@@ -20,10 +22,10 @@
 # TESTING GMODS IN AUGMENTED : Features 2, 67 and 132
 def choose_gmods(features):
     a = []
-    # print(features)
-    a.append(features[2])
-    a.append(features[67])
-    a.append(features[132])
+    # # print(features)
+    # a.append(features[2])
+    # a.append(features[67])
+    # a.append(features[132])
+    # NOTE: with the look-ups above commented out, 'a' stays empty and
+    # this function would fail if called; it is unused below
     if a[0]==min(a):
         if a[1]<=a[2]:
             return 0
@@ -40,33 +42,35 @@ def choose_gmods(features):
         else:
             return 5
 
+
 # Testing in heuristics that make all the choices at once
 first_heuristic = 1
-# for heuristic in ['T1', 'gmods', 'brown', 'random', 'virtual best']:
-for heuristic in ['gmods', 'virtual best']:
-    reps = 10
+for heuristic in ['T1', 'gmods', 'brown', 'random', 'virtual-best']:
+# for heuristic in ['gmods', 'virtual best']:
+    reps = 100
     sum_metrics = dict()
     for i in range(reps):
-        if heuristic == 'virtual best':
+        if heuristic == 'virtual-best':
             # chosen_indices = [np.argmin(timings) for timings in testing_dataset['timings']]
            chosen_indices = testing_dataset['labels']
         elif heuristic == 'random':
            chosen_indices = [random.randint(0, 5)
                              for timings in testing_dataset['timings']]
         else:
-            # chosen_indices = [not_greedy_heuristic_guess(projection[0][0], heuristic)
-            #                   for projection in testing_dataset['projections']]
-            chosen_indices = [choose_gmods(features)
-                              for features in testing_dataset['features']]
+            chosen_indices = [not_greedy_heuristic_guess(projection[0][0], heuristic)
+                              for projection in testing_dataset['projections']]
+            # chosen_indices = [choose_gmods(features)
+            #                   for features in testing_dataset['features']]
         metrics = compute_metrics(chosen_indices,
                                   testing_dataset['labels'],
                                   testing_dataset['timings'],
-                                  testing_dataset['cells'])
+                                  testing_dataset['cells'],
+                                  heuristic)
         if len(sum_metrics) == 0:
             sum_metrics = metrics
         else:
             sum_metrics = {key: metrics[key] + sum_metrics[key]
                            for key in metrics}
     aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics}
-    augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup'] else math.factorial(1)*aveg_metrics[key] for key in sum_metrics}
+    augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup'] else math.factorial(nvar)*aveg_metrics[key] for key in sum_metrics}
     print(heuristic, augmented_metrics)
     if first_heuristic == 1:
diff --git a/make_plots.py b/make_plots.py
index 7967127..8b04352 100644
--- a/make_plots.py
+++ b/make_plots.py
@@ -1,10 +1,12 @@
 """Make some plots"""
 import os
+import pickle
 import numpy as np
 from numpy import sort
 import matplotlib
 import matplotlib.pyplot as plt
 from matplotlib.pyplot import cm
+from find_filename import find_timings_lists
 matplotlib.rcParams['mathtext.rm'] = 'Bitstream Vera Sans'
 matplotlib.rcParams['mathtext.it'] = 'Bitstream Vera Sans:italic'
 matplotlib.rcParams['mathtext.bf'] = 'Bitstream Vera Sans:bold'
@@ -28,7 +30,7 @@ def survival_plot(timings: dict, plot_name="survival_plot"):
     style = ['--'] * len(timings)
     dashes = [(1, 0), (5, 1), (5, 1, 1, 1), (2, 1, 2, 1), (1, 1), (5, 5)]\
         + [(1, 0)] * len(timings)
-    
+
     for method, c, s, d in zip(timings, color, style, dashes):
         not_timeout_timings = [timing for timing in timings[method]
                                if timing != 30 and timing != 60]
@@ -46,3 +48,52 @@
                                    f'{plot_name}.png')
     plt.savefig(figure_location)
     plt.cla()
+
+
+def create_adversarial_plot(
+        model1='RF',
+        model2='RFR'
+):
+    '''
+    This function creates an adversarial plot comparing the desired models.
+    '''
+    timings_lists_filename = find_timings_lists(model1)
+    with open(timings_lists_filename, 'rb') as timings_lists_file:
+        rawtimings1 = pickle.load(timings_lists_file)
+    timings1 = [80 if timing == 60 else timing for timing in rawtimings1]
+
+    timings_lists_filename = find_timings_lists(model2)
+    with open(timings_lists_filename, 'rb') as timings_lists_file:
+        rawtimings2 = pickle.load(timings_lists_file)
+    timings2 = [80 if timing == 60 else timing for timing in rawtimings2]
+    plot, ax = plt.subplots(1, 1)
+
+    # Set the number of ticks for both axes
+    ticks = list(np.arange(0, 90, 10))
+    ticks.pop(-2)
+    ax.set_xticks(ticks)
+    ax.set_yticks(ticks)
+    # Set the tick labels for both axes
+    ticks_labels = ticks
+    ticks_labels[-1] = 'Timeout'
+    ax.set_xticklabels(ticks_labels, fontsize=fontsize)
+    ax.set_yticklabels(ticks_labels, rotation='vertical', fontsize=fontsize)
+
+    # plotting
+    ax.plot(timings1, timings2, '.')
+    ax.plot([0, 90], [0, 90], '-')
+
+    # creating labels
+    plt.xlabel(model1, **desired_font, fontsize=fontsize-2)
+    plt.ylabel(model2, **desired_font, fontsize=fontsize-2)
+
+    plt.title('Adversarial plot comparing ' + model1 + ' and ' + model2)
+    figure_location = os.path.join(os.path.dirname(__file__), 'Art',
+                                   'adversarial_plot_' + model1 +
+                                   '_vs_' + model2 + '.png')
+    plt.savefig(figure_location)
+    plt.cla()
+
+
+create_adversarial_plot()
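
Taken together with the changes to find_filename.py and test_models.py in this diff, the new plot works off pickled timing lists: compute_metrics dumps the chosen per-instance timings under results/timings_lists/<model>.txt, and create_adversarial_plot reads two such files back. A hedged end-to-end sketch (it assumes trained models and the results/timings_lists directory already exist; note also that make_plots.py now calls create_adversarial_plot() at import time, so importing it already draws the default RF-vs-RFR plot):

    # Sketch only: exercising the new timings-list plumbing end to end.
    from test_models import test_model
    from make_plots import create_adversarial_plot

    test_model('RF', 'classification')    # pickles results/timings_lists/RF.txt
    test_model('RFR', 'regression')       # pickles results/timings_lists/RFR.txt
    create_adversarial_plot('RF', 'RFR')  # writes Art/adversarial_plot_RF_vs_RFR.png
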
diff --git a/replicating_Dorians_features.py b/replicating_Dorians_features.py
index a5ff0e0..ae145aa 100644
--- a/replicating_Dorians_features.py
+++ b/replicating_Dorians_features.py
@@ -6,15 +6,7 @@
 import itertools
 # from xml.sax.handler import all_features
 import numpy as np
-
-
-def aveg(given_list):
-    return sum(given_list)/len(given_list)
-
-
-def aveg_not_zero(given_list):
-    return sum(given_list)/max(1, len([1 for elem in given_list
-                                       if elem != 0]))
+from config.general_values import operations
 
 
 def identity(input):
@@ -35,8 +27,7 @@ def sign(input):
         raise Exception("How is this possible?")
 
 
-def create_features(degrees, variable=0, sv=False,
-                    operations=[sum, max, aveg, aveg_not_zero]):
+def create_features(degrees, variable=0, sv=False):
     sign_or_not = [identity, sign]
     features = []
     features_names = []
@@ -66,6 +57,8 @@ def extract_features(dataset):
     all_original_polynomials = []
     all_projections = []
     all_cells = []
+    for index, elem in enumerate(dataset):
+        print(index, elem[0])
     for index, projections in enumerate(dataset[0]):
         all_projections.append(projections)
         original_polynomials = projections[0][0]
@@ -74,11 +67,9 @@
         all_original_polynomials.append(original_polynomials)
         all_labels.append(dataset[1][index])
         all_timings.append(dataset[2][index])
-        all_cells.append(dataset[3][index])
+        all_cells.append(dataset[4][index])
         names, instance_features = \
-            features_from_set_of_polys(
-                original_polynomials,
-                operations=[sum, max, aveg, aveg_not_zero])
+            features_from_set_of_polys(original_polynomials)
         all_features.append(instance_features)
     my_dataset['polynomials'] = np.array(all_original_polynomials)
     my_dataset['names'] = np.array(names)
@@ -90,38 +81,33 @@
     return my_dataset
 
 
-def features_from_set_of_polys(original_polynomials,
-                               operations=[sum, max, aveg, aveg_not_zero]):
+def features_from_set_of_polys(original_polynomials):
     instance_features = []
     names = []
     nvar = len(original_polynomials[0][0]) - 1
     for var in range(nvar):
         var_features, var_names = \
             compute_features_for_var(original_polynomials,
-                                     var,
-                                     operations=operations)
+                                     var)
         instance_features += var_features
         names += var_names
     return names, instance_features
 
 
-def compute_features_for_var(original_polynomials, var,
-                             operations=[sum, max, aveg]):
+def compute_features_for_var(original_polynomials, var):
     '''Given polynomials and a variable, computes the features'''
     degrees = [[monomial[var] for monomial in poly]
                for poly in original_polynomials]
     var_features, var_features_names = \
         create_features(degrees,
-                        variable=var,
-                        operations=operations)
+                        variable=var)
     sdegrees = \
         [[sum(monomial[:-1]) for monomial in poly if monomial[var] != 0]
         + [0] for poly in original_polynomials]
     svar_features, svar_features_names = \
         create_features(sdegrees,
                         variable=var,
-                        sv=True,
-                        operations=operations)
+                        sv=True)
     var_names = var_features_names + svar_features_names
     var_features = var_features + svar_features
     return var_features, var_names
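
A worked micro-example of the degree extraction above. It assumes, consistently with nvar = len(original_polynomials[0][0]) - 1 and the monomial[:-1] slice, that each monomial is encoded as one exponent per variable plus a trailing coefficient; the polynomial below is made up:

    # Illustrative encoding with nvar = 3: p = x1^2*x3 + 3*x2
    poly = [[2, 0, 1, 1],   # x1^2 * x3, coefficient 1
            [0, 1, 0, 3]]   # 3 * x2
    original_polynomials = [poly]

    var = 0  # features for x1
    degrees = [[monomial[var] for monomial in poly]
               for poly in original_polynomials]      # [[2, 0]]
    sdegrees = [[sum(monomial[:-1]) for monomial in poly
                 if monomial[var] != 0] + [0]
                for poly in original_polynomials]     # [[3, 0]]
    # create_features then applies each entry of `operations`
    # (sum, max, aveg) to these per-polynomial lists,
    # both directly and through sign().
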
diff --git a/test_models.py b/test_models.py
index a9ad961..92ff35d 100644
--- a/test_models.py
+++ b/test_models.py
@@ -10,7 +10,9 @@
 from find_filename import find_output_filename
 from find_filename import find_dataset_filename
 from find_filename import find_model_filename
-from train_models import ordering_choice_reinforcement
+from find_filename import find_timings_lists
+# from train_models import ordering_choice_reinforcement
+# from train_models import training_instances_reinforcement
 # Check if 'dataset_manipulation' is installed
 if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
     from dataset_manipulation import augmentate_instance
@@ -41,9 +43,6 @@ def test_results(training_method):
                                                  testing_method)
         accuracy[testing_method] = test_model(trained_model_filename,
                                               test_dataset_filename)
-        print('testing_method', testing_method)
-        print('ml_model', ml_model)
-        print('acc', accuracy[testing_method])
     round_accuracies = [round(acc, 2)
                         for acc in [accuracy[method]
                                     for method in dataset_qualities]]
@@ -100,48 +99,54 @@ def test_regressor(ml_model):
 
 def test_model(ml_model, paradigm, testing_method='augmented'):
     trained_model_filename = find_model_filename(paradigm, ml_model)
-    print(trained_model_filename, paradigm, ml_model)
+    # print(trained_model_filename, paradigm, ml_model)
     test_dataset_filename = find_dataset_filename('Test', testing_method)
     with open(trained_model_filename, 'rb') as trained_model_file:
         model = pickle.load(trained_model_file)
     with open(test_dataset_filename, 'rb') as test_dataset_file:
         testing_dataset = pickle.load(test_dataset_file)
-    print("here")
     if ml_model in ml_regressors and paradigm == 'regression':
         chosen_indices = [return_regressor_choice(model, features)
                           for features in testing_dataset['features']]
     elif ml_model in ml_models:
+        # print('testing_method', testing_method)
         chosen_indices = [model.predict([features])[0]
                           for features in testing_dataset['features']]
     elif paradigm == 'reinforcement' and testing_method == 'Normal':
+        # imported here rather than at the top to avoid the circular
+        # import with train_models, which now imports compute_metrics
+        from train_models import ordering_choice_reinforcement
         chosen_indices = [ordering_choice_reinforcement(model, projections)
                           for projections in testing_dataset['projections']]
-    print(chosen_indices)
-    print("here2")
+    # print(chosen_indices)
+    # print("here2")
     return compute_metrics(chosen_indices,
                            testing_dataset['labels'],
                            testing_dataset['timings'],
-                           testing_dataset['cells'])
+                           testing_dataset['cells'],
+                           ml_model)
 
 
-def compute_metrics(chosen_indices, labels, all_timings, all_cells):
+def compute_metrics(chosen_indices, labels, all_timings, all_cells, model):
     metrics = dict()
     correct = 0
-    metrics['Total time'] = 0
+    metrics['TotalTime'] = 0
     total_markup = 0
     metrics['Completed'] = 0
-    metrics['Total cells'] = 0
+    metrics['TotalCells'] = 0
+    # TODO: remove the following loop
     for chosen_index, label, timings, cells in \
             zip(chosen_indices, labels, all_timings, all_cells):
         if chosen_index == label:
             correct += 1
-        print(timings, chosen_index)
         if timings[chosen_index] not in [30, 60]:
             metrics['Completed'] += 1
-        metrics['Total time'] += timings[chosen_index]
         total_markup += (timings[chosen_index]-timings[label])/(timings[label] + 1)
-        metrics['Total cells'] += cells[chosen_index]
+        metrics['TotalCells'] += cells[chosen_index]
+    chosen_times = [timings[index] for index, timings
+                    in zip(chosen_indices, all_timings)]
+    timings_lists_filename = find_timings_lists(model)
+    with open(timings_lists_filename, 'wb') as timings_lists_file:
+        pickle.dump(chosen_times, timings_lists_file)
+    metrics['TotalTime'] = sum(chosen_times)
     total_instances = len(chosen_indices)
     metrics['Accuracy'] = correct/total_instances
     metrics['Markup'] = total_markup/total_instances
@@ -149,7 +154,7 @@ def return_regressor_choice(model, features):
-    nvar = 3  ## Make this better
+    nvar = 3  # Make this better
     made_up_timings = list(range(math.factorial(nvar)))
     made_up_cells = list(range(math.factorial(nvar)))
     augmentated_features, _, _ = \
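
For reference, the Markup metric accumulated in compute_metrics above adds (t_chosen - t_best)/(t_best + 1) per instance, where t_best is the timing of the labelled best ordering and the +1 damps the ratio for very fast instances. A worked example with made-up numbers:

    timings = [2.0, 5.0, 3.5]  # per-ordering timings for one instance
    label = 0                  # best ordering (2.0s)
    chosen_index = 2           # ordering the model picked (3.5s)
    markup = (timings[chosen_index] - timings[label]) / (timings[label] + 1)
    # (3.5 - 2.0) / (2.0 + 1) = 0.5
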
diff --git a/test_train_datasets.py b/test_train_datasets.py
index fb3bf71..7279244 100644
--- a/test_train_datasets.py
+++ b/test_train_datasets.py
@@ -73,6 +73,7 @@ def create_train_test_datasets():
                 *[datasets[f'{purpose}_Normal'][key2] for key2 in keys],
                 nvar=3))
         }
+        print(f"features in {purpose}_Augmented", len(datasets[f'{purpose}_Augmented']['features'][0]))
         datasets[f'{purpose}_Augmented']['labels'] = \
             [timings.index(min(timings))
              for timings in datasets[f'{purpose}_Augmented']['timings']]
     for purpose in purposes:
@@ -135,4 +136,6 @@
 # print(classification_dataset['labels'])
 
 
-create_regression_datasets(taking_logarithms=False)
\ No newline at end of file
+# create_regression_datasets(taking_logarithms=False)
+
+# create_train_test_datasets()
\ No newline at end of file
diff --git a/train_models.py b/train_models.py
index e195c72..5aeede2 100644
--- a/train_models.py
+++ b/train_models.py
@@ -1,5 +1,6 @@
 import math
 import pickle
+import random
 from yaml_tools import read_yaml_from_file
 from config.ml_models import sklearn_models
 from config.ml_models import ml_regressors
@@ -12,6 +13,7 @@
 from sklearn import metrics
 from itertools import combinations
 from replicating_Dorians_features import compute_features_for_var
+from test_models import compute_metrics
 
 
 def train_model(ml_model, method):
@@ -67,12 +69,6 @@ def test_regression_model(method, regressor):
     y_pred = [choose_using_regression(x_i, regressor) for x_i in x_test]
 
 
-# for ml_reg in ml_regressors:
-#     print(ml_reg)
-#     regressor = train_regression_model(ml_reg, 'balanced')
-#     print(ml_reg)
-#     test_regression_model('balanced', regressor)
-
 def train_reinforcement_model(ml_model, method='Normal'):
     train_data_filename = find_dataset_filename('Train', method=method)
     with open(train_data_filename, 'rb') as train_data_file:
@@ -84,13 +80,18 @@
     model = current_model()
     first_polys = train_dataset['projections'][0][0][0]
     first_features = get_vars_features(first_polys)
-    first_labels = [1]*len(first_features)
+    first_labels = [random.random() for _ in range(len(first_features))]
     model.fit(first_features, first_labels)
-    for projections, timings \
-            in zip(train_dataset['projections'], train_dataset['timings']):
-        training_features, training_labels = \
-            training_instances_reinforcement(model, projections, timings)
+    training_features, training_labels = [], []
+    for i in range(30):
+        for projections, timings \
+                in zip(train_dataset['projections'], train_dataset['timings']):
+            new_training_features, new_training_labels = \
+                training_instances_reinforcement(model, projections, timings)
+            training_features += new_training_features
+            training_labels += new_training_labels
         model.fit(training_features, training_labels)
+        print(test_reinforcement_model(model))
     trained_model_filename = find_model_filename('reinforcement', ml_model)
     with open(trained_model_filename, 'wb') as trained_model_file:
         pickle.dump(model, trained_model_file)
@@ -100,8 +101,6 @@
 def training_instances_reinforcement(model, projections, timings):
     original_polynomials = projections[0][0]
     nvar = len(original_polynomials[0][0]) - 1
     vars_features = get_vars_features(original_polynomials)
-    print(len(vars_features[0]))
-    print(model.predict([vars_features[0]]))
     evaluations = [model.predict([var_features])[0]
                    for var_features in vars_features]
     timing = []
@@ -119,7 +118,7 @@
     pairs = list(combinations(range(nvar), 2))
     for i, j in pairs:
         correction_coefficient = \
-            math.sqrt((timing[j]/timing[j])/(evaluations[i]/evaluations[j]))
+            math.sqrt((timing[i]/timing[j])/(evaluations[i]/evaluations[j]))
         instances_features += [vars_features[i], vars_features[j]]
         instances_labels += [evaluations[i]*correction_coefficient,
                              evaluations[j]/correction_coefficient]
@@ -131,15 +130,12 @@
 def get_vars_features(polynomials):
     '''This function computes the features of each variable
     in the given set of polynomials'''
     vars_features = []
     nvar = len(polynomials[0][0]) - 1
-    print('number of variables', nvar)
     unique_features_filename = find_other_filename("unique_features")
     with open(unique_features_filename, 'rb') as unique_features_file:
         unique_names = pickle.load(unique_features_file)
     for var in range(nvar):
-        print('variable', var)
         var_features, var_names = \
             compute_features_for_var(polynomials, var)
-        print('var_features', var_features)
         var_features = [feature for feature, name
                         in zip(var_features, var_names)
                         if name in unique_names]
@@ -151,8 +147,6 @@
 def var_choice_reinforcement(model, polynomials):
     '''This function will return the next variable to project,
     chosen by the model trained using reinforcement'''
     vars_features = get_vars_features(polynomials)
-    print(polynomials)
-    print(len(vars_features), len(vars_features[0]), '\n', vars_features)
     evaluations = model.predict(vars_features)
     return np.argmin(evaluations)
 
@@ -168,4 +162,22 @@
     return ordering
 
 
-train_reinforcement_model('RFR')
+def test_reinforcement_model(ml_model, method='Normal', nvar=3):
+    train_data_filename = find_dataset_filename('Test', method=method)
+    with open(train_data_filename, 'rb') as train_data_file:
+        testing_dataset = pickle.load(train_data_file)
+    # trained_model_filename = find_model_filename('reinforcement', ml_model)
+    # with open(trained_model_filename, 'rb') as trained_model_file:
+    #     model = pickle.load(trained_model_file)
+    model = ml_model
+    chosen_indices = [ordering_choice_reinforcement(model, projections)
+                      for projections in testing_dataset['projections']]
+    metrics = compute_metrics(chosen_indices,
+                              testing_dataset['labels'],
+                              testing_dataset['timings'],
+                              testing_dataset['cells'],
+                              'reinforcement')
+    augmented_metrics = {key: metrics[key] if key in ['Accuracy', 'Markup']
+                         else math.factorial(nvar)*metrics[key]
+                         for key in metrics}
+    return augmented_metrics
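
On the corrected line in training_instances_reinforcement above: for each pair of variables (i, j), the correction coefficient nudges the training labels so that the ratio of the model's evaluations matches the observed timing ratio, and the square root splits that correction symmetrically between the two variables (a higher evaluation means slower, since var_choice_reinforcement takes the argmin). A worked example with made-up numbers:

    import math

    # The model currently rates variables i and j equally, but orderings
    # starting with i actually run 4x slower than those starting with j.
    timing_i, timing_j = 8.0, 2.0
    eval_i, eval_j = 1.0, 1.0

    correction = math.sqrt((timing_i/timing_j) / (eval_i/eval_j))  # sqrt(4) = 2
    label_i = eval_i * correction  # 2.0: push i's evaluation up (slower)
    label_j = eval_j / correction  # 0.5: push j's evaluation down (faster)
    # label_i / label_j == 4 == timing_i / timing_j
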