diff --git a/choose_hyperparams.py b/choose_hyperparams.py index f3beb66..b9570df 100644 --- a/choose_hyperparams.py +++ b/choose_hyperparams.py @@ -28,16 +28,16 @@ def k_folds_ml(x_train, y_train, model, random_state=0): return rf_cv.best_params_ -def choose_hyperparams(ml_model, method): +def choose_hyperparams(model_name, paradigm, training_quality): """Given a ml_model and a method, a file with the hyperparameters chosen by cross validation is created""" - this_dataset_file = find_dataset_filename('Train', method=method) + this_dataset_file = find_dataset_filename('Train', dataset_quality=training_quality) with open(this_dataset_file, 'rb') as f: dataset = pickle.load(f) - hyperparams = k_folds_ml(dataset['features'], dataset['labels'], model=ml_model) + hyperparams = k_folds_ml(dataset['features'], dataset['labels'], model=model_name) print(hyperparams) - hyperparams_filename = find_hyperparams_filename(method, ml_model) - print(hyperparams_filename) + hyperparams_filename = find_hyperparams_filename(model_name, paradigm, training_quality) + print('new hyperparams_filename', hyperparams_filename) write_yaml_to_file(hyperparams, hyperparams_filename) diff --git a/config/general_values.py b/config/general_values.py index f9a5d8d..6cf50b5 100644 --- a/config/general_values.py +++ b/config/general_values.py @@ -1,6 +1,6 @@ purposes = ['Train', 'Test'] -dataset_qualities = ['Normal', 'Balanced', 'Augmented'] +dataset_qualities = ['Biased', 'Balanced', 'Augmented'] def aveg(given_list): diff --git a/config/hyperparameters_grid.py b/config/hyperparameters_grid.py index 6f30462..d3aa5e2 100644 --- a/config/hyperparameters_grid.py +++ b/config/hyperparameters_grid.py @@ -2,9 +2,9 @@ grid = dict() grid['RF-Classifier'] = { - 'n_estimators': [200, 300, 400, 500], + 'n_estimators': [200, 500], 'max_features': ['sqrt', 'log2'], - 'max_depth': [4, 5, 6, 7, 8], + 'max_depth': [4, 6, 8], 'criterion': ['gini', 'entropy'] } grid['KNN-Classifier'] = { @@ -15,8 +15,7 @@ # 'p': range(1, 4, 1) } grid['MLP-Classifier'] = { - 'hidden_layer_sizes': [(5, 5), (15, 15), (20, 20), - (10, 10, 10), (20, 20, 20)], + 'hidden_layer_sizes': [(30, 30), (10, 10, 10), (20, 20, 20)], 'activation': ['tanh', 'relu'], 'solver': ['sgd', 'adam'], 'learning_rate': ['constant', 'adaptive'], @@ -43,9 +42,13 @@ } grid['RF-Regressor'] = { - 'criterion': ['squared_error', 'friedman_mse'], - "max_depth": [1, 3, 7], - "min_samples_leaf": [1, 5, 10], + 'n_estimators': [200, 500], + 'max_features': ['sqrt', 'log2'], + 'max_depth': [4, 6, 8], + 'criterion': ['friedman_mse', 'squared_error'] + # 'criterion': ['squared_error', 'friedman_mse'], + # "max_depth": [1, 3, 7], + # "min_samples_leaf": [1, 5, 10], } grid['KNN-Regressor'] = { 'n_neighbors': [3, 5, 10], @@ -53,10 +56,16 @@ 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'] } grid['MLP-Regressor'] = { - 'hidden_layer_sizes': [(100,), (20, 20), (10, 10, 10)], - 'activation': ['logistic', 'tanh', 'relu'], - 'solver': ['adam', 'sgd'], - 'alpha': [0.0001, 0.001, 0.01] + 'hidden_layer_sizes': [(30, 30), (10, 10, 10), (20, 20, 20)], + 'activation': ['tanh', 'relu'], + 'solver': ['sgd', 'adam'], + 'learning_rate': ['constant', 'adaptive'], + 'alpha': [0.05, 0.005], + 'max_iter': [1000] + # 'hidden_layer_sizes': [(30, 30), (20, 20, 20), (10, 10, 10)], + # 'activation': ['logistic', 'tanh', 'relu'], + # 'solver': ['adam', 'sgd'], + # 'alpha': [0.0001, 0.001, 0.01] } grid['DT-Regressor'] = { "splitter": ["best", "random"], diff --git a/config/ml_models.py b/config/ml_models.py index 
5a35d1b..d1778cd 100644 --- a/config/ml_models.py +++ b/config/ml_models.py @@ -35,4 +35,4 @@ all_models = {**classifiers, **regressors} -heuristics = []#'T1', 'gmods', 'brown', 'random', 'virtual-best'] +heuristics = ['T1', 'gmods', 'brown', 'random', 'virtual-best'] diff --git a/create_clean_dataset.py b/create_clean_dataset.py index ec56e2f..4651870 100644 --- a/create_clean_dataset.py +++ b/create_clean_dataset.py @@ -83,7 +83,6 @@ def cleaning_dataset(): def convert_to_timing(timing_str, penalization=2): if not contains_float(timing_str): - print(penalization * float(timing_str[5:])) return penalization * float(timing_str[5:]) return float(timing_str) @@ -109,4 +108,4 @@ def contains_int(input_str): match = re.match(int_pattern, input_str) return match is not None -cleaning_dataset() \ No newline at end of file +# cleaning_dataset() diff --git a/datasets/clean_dataset.txt b/datasets/clean_dataset.txt index 2a4b33d..5242697 100644 Binary files a/datasets/clean_dataset.txt and b/datasets/clean_dataset.txt differ diff --git a/find_filename.py b/find_filename.py index bd62c25..aa6a9ac 100644 --- a/find_filename.py +++ b/find_filename.py @@ -15,7 +15,7 @@ def find_model_filename(model_name, paradigm, training_quality): f'{model_name}-{paradigm}-{training_quality}.txt') -def find_dataset_filename(purpose, method=None): +def find_dataset_filename(purpose, dataset_quality=None, paradigm=''): if purpose == "unclean": return os.path.join(os.path.dirname(__file__), 'DatasetsBeforeProcessing', @@ -34,7 +34,7 @@ def find_dataset_filename(purpose, method=None): elif purpose in purposes: return os.path.join(os.path.dirname(__file__), 'datasets', f'{purpose}', - f'{method}_{purpose}_dataset.txt') + f'{dataset_quality}-{purpose}-{paradigm}-dataset.txt') else: raise Exception(f"Purpose {purpose} not found") diff --git a/main.py b/main.py index 25c62af..fb1f5a8 100644 --- a/main.py +++ b/main.py @@ -34,6 +34,7 @@ train_the_models = True paradigm = 'classification' +print("MAIN.PY") cleaning_dataset() create_train_test_datasets() diff --git a/main_heuristics.py b/main_heuristics.py index 462b341..da50ea8 100644 --- a/main_heuristics.py +++ b/main_heuristics.py @@ -5,14 +5,14 @@ # import numpy as np from Heuristics.heuristics_guess import not_greedy_heuristic_guess from Heuristics.heuristics_guess import ordering_given_projections -from find_filename import find_dataset_filename -from test_models import compute_metrics -from config.ml_models import heuristics +# from find_filename import find_dataset_filename +# from test_models import compute_metrics +# from config.ml_models import heuristics random.seed(0) nvar = 3 -testing_method = 'Normal' +testing_method = 'Biased' # # TESTING GMODS IN AUUGMENTED : Features 2, 67 and 132 # def choose_gmods(features): @@ -38,61 +38,63 @@ # return 5 -def ordering_choices_heuristics(heuristic, testing_dataset, greedy=False): +def ordering_choices_heuristics(heuristic, testing_dataset, paradigm): if heuristic == 'virtual-best': chosen_indices = testing_dataset['labels'] elif heuristic == 'random': chosen_indices = [random.randint(0, len(timings)-1) for timings in testing_dataset['timings']] else: - if greedy: + if paradigm == 'Greedy': chosen_indices = [ordering_given_projections(projection, heuristic) for projection in testing_dataset['projections']] - else: - chosen_indices = [not_greedy_heuristic_guess(projection[0][0], + elif paradigm == 'NotGreedy': + chosen_indices = [not_greedy_heuristic_guess(polynomials, heuristic) - for projection in 
testing_dataset['projections']] + for polynomials in testing_dataset['polynomials']] + else: + raise Exception(f"Paradigm {paradigm} not recognised for a heuristic.") return chosen_indices -if __name__ == "__main__": - test_dataset_filename = find_dataset_filename('Test', - testing_method) - with open(test_dataset_filename, 'rb') as test_dataset_file: - testing_dataset = pickle.load(test_dataset_file) - output_file = "heuristics_output_acc_time.csv" +# if __name__ == "__main__": +# test_dataset_filename = find_dataset_filename('Test', +# testing_method) +# with open(test_dataset_filename, 'rb') as test_dataset_file: +# testing_dataset = pickle.load(test_dataset_file) +# output_file = "heuristics_output_acc_time.csv" - # Testing in heuristics that make all the choice at once - first_heuristic = 1 - for greedy in [True, False]: - for heuristic in heuristics: - # for heuristic in ['gmods', 'virtual best']: - reps = 100 - for i in range(reps): - chosen_indices = ordering_choices_heuristics(heuristic, - testing_dataset, - greedy=greedy) - metrics = compute_metrics(chosen_indices, - testing_dataset) - if i == 0: - sum_metrics = metrics - else: - sum_metrics = {key: metrics[key] + sum_metrics[key] - for key in metrics} - aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics} - augmented_metrics = {key: aveg_metrics[key] - if key in ['Accuracy', 'Markup'] - else math.factorial(nvar)*aveg_metrics[key] - for key in sum_metrics} +# # Testing in heuristics that make all the choice at once +# first_heuristic = 1 +# for greedy in [True, False]: +# for heuristic in heuristics: +# # for heuristic in ['gmods', 'virtual best']: +# reps = 100 +# for i in range(reps): +# chosen_indices = ordering_choices_heuristics(heuristic, +# testing_dataset, +# greedy=greedy) +# metrics = compute_metrics(chosen_indices, +# testing_dataset) +# if i == 0: +# sum_metrics = metrics +# else: +# sum_metrics = {key: metrics[key] + sum_metrics[key] +# for key in metrics} +# aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics} +# augmented_metrics = {key: aveg_metrics[key] +# if key in ['Accuracy', 'Markup'] +# else math.factorial(nvar)*aveg_metrics[key] +# for key in sum_metrics} - print('not-'*(not greedy) + 'greedy-' + heuristic, - augmented_metrics) - if first_heuristic == 1: - first_heuristic = 0 - keys = list(augmented_metrics.keys()) - with open(output_file, 'a') as f: - f.write(', '.join(['Model'] + keys) + '\n') - with open(output_file, 'a', newline='') as f: - writer = csv.writer(f) - writer.writerow(['not-'*(not greedy) + 'greedy-' + heuristic] - + [augmented_metrics[key] for key in keys]) +# print('not-'*(not greedy) + 'greedy-' + heuristic, +# augmented_metrics) +# if first_heuristic == 1: +# first_heuristic = 0 +# keys = list(augmented_metrics.keys()) +# with open(output_file, 'a') as f: +# f.write(', '.join(['Model'] + keys) + '\n') +# with open(output_file, 'a', newline='') as f: +# writer = csv.writer(f) +# writer.writerow(['not-'*(not greedy) + 'greedy-' + heuristic] +# + [augmented_metrics[key] for key in keys]) diff --git a/replicating_Dorians_features.py b/replicating_Dorians_features.py index 832c44b..bf1cf44 100644 --- a/replicating_Dorians_features.py +++ b/replicating_Dorians_features.py @@ -69,13 +69,14 @@ def extract_features(dataset): names, instance_features = \ features_from_set_of_polys(original_polynomials) all_features.append(instance_features) - my_dataset['polynomials'] = np.array(all_original_polynomials) + my_dataset['polynomials'] = all_original_polynomials 
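# Keeping 'polynomials' (and 'projections' below) as plain Python lists avoids
# building ragged NumPy arrays: each instance contributes a different number of
# polynomials and differently sized projection data, which NumPy can only hold
# as dtype=object (newer NumPy versions raise an error rather than infer a
# ragged object array). If an array form were ever needed again, a minimal
# sketch, assuming the same variables as above, would be:
# my_dataset['polynomials'] = np.array(all_original_polynomials, dtype=object)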
my_dataset['names'] = np.array(names) my_dataset['features'] = np.array(all_features) my_dataset['labels'] = np.array(all_labels) my_dataset['timings'] = np.array(all_timings) - my_dataset['projections'] = np.array(all_projections) + my_dataset['projections'] = all_projections my_dataset['cells'] = np.array(all_cells) + # all these use to be converted to np.array() return my_dataset diff --git a/run_for_paper.py b/run_for_paper.py new file mode 100644 index 0000000..6af4d39 --- /dev/null +++ b/run_for_paper.py @@ -0,0 +1,174 @@ +import os +import pickle +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + +from create_clean_dataset import cleaning_dataset +from test_train_datasets import create_train_test_datasets +from test_train_datasets import create_regression_datasets +from config.ml_models import all_models +from config.ml_models import regressors +from config.ml_models import classifiers +from config.ml_models import heuristics +from choose_hyperparams import choose_hyperparams +from train_models import train_model +from main_heuristics import ordering_choices_heuristics +from find_filename import find_dataset_filename +from find_filename import find_timings_lists +from find_filename import find_hyperparams_filename +from test_models import compute_metrics +from test_models import choose_indices + + +def metrics_for_all_reps(all_indices_chosen, testing_dataset, ml_model): + all_metrics = [compute_metrics(chosen_indices, testing_dataset) + for chosen_indices in all_indices_chosen] + aveg_metrics = {key: sum(metrics[key]/len(all_metrics) + for metrics in all_metrics) + for key in all_metrics[0]} + all_timings = testing_dataset['timings'] + aveg_timings = [] + for instance in range(len(all_indices_chosen[0])): + instance_timings = [timings[indices_chosen[instance]] + for timings, indices_chosen + in zip(all_timings, all_indices_chosen)] + aveg_timings.append(instance_timings) + timings_lists_filename = find_timings_lists(ml_model) + with open(timings_lists_filename, 'wb') as timings_lists_file: + pickle.dump(aveg_timings, timings_lists_file) + all_total_times = [metrics['TotalTime'] for metrics in all_metrics] + return aveg_metrics, all_total_times + + +def dominiks_plots(all_total_times): + data = [] + for key in all_total_times: + data.extend([{'Model': key, 'Total time': total_time} + for total_time in all_total_times[key]]) + df = pd.DataFrame(data) + + # Create a box plot + plt.figure(figsize=(8, 6)) + sns.boxplot(x='Model', y='Total time', data=df) + + # Add labels and title + plt.xlabel('Model') + plt.ylabel('Total time') + plt.title('Model Total time Comparison') + + # Display the plot + plt.show() + + +def repeat_instances_dataset(dataset, n_reps): + new_dataset = dict() + for key in dataset: + new_dataset[key] = [elem for elem in dataset[key] + for _ in range(n_reps)] + return new_dataset + + +def study_a_model(model_name: str, + testing_quality: str, + paradigm: str, + training_quality: str = '', + tune_hyperparameters: bool = False, + reps: int = 10 + ): + if model_name in heuristics: + if training_quality != '': + raise Exception(f"training_quality cannot be {training_quality}.") + if tune_hyperparameters is not False: + raise Exception(f"Hyperparams cannot be tuned for {paradigm}.") + testing_filename = find_dataset_filename('Test', testing_quality) + with open(testing_filename, 'rb') as testing_file: + testing_dataset = pickle.load(testing_file) + factorial_nvar = len(testing_dataset['projections'][0]) + if testing_quality in ['Biased', 
'Balanced']: + # If the dataset contains less factorial_nvar less instances, + # we repeat each instance factorial_nvar times + testing_dataset = \ + repeat_instances_dataset(testing_dataset, factorial_nvar) + all_metrics = [] + all_timings = [] + for _ in range(reps): + if model_name not in heuristics: + # If the paradigm is 'Heuristics' there is no need + # to tune_hyperparameters or to train the models + hyperparams_filename = find_hyperparams_filename(model_name, + paradigm, + training_quality) + '.yaml' + if tune_hyperparameters or not os.path.exists(hyperparams_filename): + if not os.path.exists(hyperparams_filename): + print('hyperparams_filename doesnt exits \n', hyperparams_filename) + choose_hyperparams(model_name, paradigm, training_quality) + # Hyperparameters ready + train_model(model_name, paradigm, training_quality) + # Model trained + chosen_indices = choose_indices(model_name, testing_dataset, + paradigm, training_quality) + # Indices chosen by the model + all_metrics.append(compute_metrics(chosen_indices, testing_dataset)) + all_timings.append([timings[index] for timings, index + in zip(testing_dataset['timings'], + chosen_indices)]) + model_info = dict() + model_info['AverageMetrics'] = {key: sum(metrics[key] for metrics + in all_metrics)/reps + for key in all_metrics[0]} + # average metrics computed for comparison purposes + model_info['AverageTimings'] = [sum(all_timings_in_instance)/reps + for all_timings_in_instance + in zip(*all_timings)] + # average timings in each instance to create adversarial plots + for key in all_metrics[0]: + model_info['All' + key] = [metrics[key] + for metrics in all_metrics] + # info of all metrics saved for seaborn boxplots + return model_info + + +if __name__ == "__main__": + reps = 50 + data = dict() + data['TotalTime'] = [] + new_datasets = False + if new_datasets: + cleaning_dataset() + create_train_test_datasets() + create_regression_datasets() + all_total_times = dict() + for model_name in list(all_models) + heuristics: + if model_name in heuristics: + testing_quality = 'Biased' + training_quality = '' + tune_hyperparameters = False + paradigm = 'Greedy' # NotGreedy + else: + testing_quality = 'Augmented' + training_quality = 'Augmented' + tune_hyperparameters = False + if model_name in classifiers: + paradigm = '' + elif model_name in regressors: + paradigm = 'Regression' + + model_info = study_a_model(model_name=model_name, + testing_quality=testing_quality, + paradigm=paradigm, + training_quality=training_quality, + tune_hyperparameters=tune_hyperparameters, + reps=reps + ) + all_total_times[model_name] = model_info['AllTotalTime'] + + dominiks_plots(all_total_times) + + + + +# def choose_indices(model, dataset): +# if model in classifiers: +# elif model in heuristics: +# ordering_choices_heuristics(model, dataset) diff --git a/test_models.py b/test_models.py index ef32a4b..662c82a 100644 --- a/test_models.py +++ b/test_models.py @@ -12,7 +12,7 @@ from find_filename import find_output_filename from find_filename import find_dataset_filename from find_filename import find_model_filename -from find_filename import find_timings_lists +from main_heuristics import ordering_choices_heuristics # from train_models import ordering_choice_reinforcement # from train_models import training_instances_reinforcement # Check if 'dataset_manipulation' is installed @@ -52,8 +52,8 @@ def test_results(training_method): round_accuracies) -def test_classifier(ml_model, testing_method='augmented'): - trained_model_filename = 
find_model_filename('classification', +def test_classifier(ml_model, testing_method='Augmented'): + trained_model_filename = find_model_filename('Classification', ml_model) test_dataset_filename = find_dataset_filename('Test', testing_method) @@ -65,8 +65,8 @@ def test_classifier(ml_model, testing_method='augmented'): return compute_metrics(chosen_indices, y_test, all_timings) -def timings_in_test(model, testing_method='augmented', training_method=None): - test_dataset_filename = find_dataset_filename('test', +def timings_in_test(model, testing_method='Augmented', training_method=None): + test_dataset_filename = find_dataset_filename('Test', testing_method) with open(test_dataset_filename, 'rb') as test_dataset_file: x_test, _, all_timings = pickle.load(test_dataset_file) @@ -85,10 +85,10 @@ def timings_in_test(model, testing_method='augmented', training_method=None): def test_regressor(ml_model): - trained_model_filename = find_model_filename('regression', + trained_model_filename = find_model_filename('Regression', ml_model) - test_dataset_filename = find_dataset_filename('test', - 'regression') + test_dataset_filename = find_dataset_filename('Test', + 'Regression') with open(trained_model_filename, 'rb') as trained_model_file: model = pickle.load(trained_model_file) with open(test_dataset_filename, 'rb') as test_dataset_file: @@ -98,7 +98,7 @@ def test_regressor(ml_model): print(f"{ml_model} gave {avg_error}") -def test_model(ml_model, paradigm, testing_method='augmented'): +def test_model(ml_model, paradigm, testing_method='Augmented'): test_dataset_filename = find_dataset_filename('Test', testing_method) with open(test_dataset_filename, 'rb') as test_dataset_file: @@ -115,21 +115,22 @@ def test_model(ml_model, paradigm, testing_method='augmented'): ml_model) -def choose_indices(ml_model, dataset, paradigm=''): - trained_model_filename = find_model_filename(paradigm, ml_model) - with open(trained_model_filename, 'rb') as trained_model_file: - model = pickle.load(trained_model_file) - if ml_model in regressors: - chosen_indices = [return_regressor_choice(model, features) - for features in dataset['features']] - elif ml_model in classifiers: - chosen_indices = [model.predict([features])[0] - for features in dataset['features']] - elif paradigm == 'reinforcement': - chosen_indices = [ordering_choice_reinforcement(model, projections) - for projections in dataset['projections']] - elif ml_model in heuristics: - ordering_choices_heuristics(model, dataset) +def choose_indices(model_name, testing_dataset, paradigm='', training_quality='Augmented'): + if model_name in heuristics: + chosen_indices = ordering_choices_heuristics(model_name, testing_dataset, paradigm) + else: + trained_model_filename = find_model_filename(model_name, paradigm, training_quality) + with open(trained_model_filename, 'rb') as trained_model_file: + model = pickle.load(trained_model_file) + if model_name in regressors: + chosen_indices = [return_regressor_choice(model, features) + for features in testing_dataset['features']] + elif model_name in classifiers: + chosen_indices = [model.predict([features])[0] + for features in testing_dataset['features']] + elif paradigm == 'Reinforcement': + chosen_indices = [ordering_choice_reinforcement(model, projections) + for projections in testing_dataset['projections']] return chosen_indices diff --git a/test_train_datasets.py b/test_train_datasets.py index 0cc51b1..cdc7c3c 100644 --- a/test_train_datasets.py +++ b/test_train_datasets.py @@ -40,16 +40,16 @@ def 
create_train_test_datasets(): # train and test sets are created random_state = 0 print(dataset.keys()) - datasets['Train_Normal']['features'], \ - datasets['Test_Normal']['features'], \ - datasets['Train_Normal']['labels'], \ - datasets['Test_Normal']['labels'], \ - datasets['Train_Normal']['timings'], \ - datasets['Test_Normal']['timings'], \ - datasets['Train_Normal']['projections'], \ - datasets['Test_Normal']['projections'], \ - datasets['Train_Normal']['cells'], \ - datasets['Test_Normal']['cells'] = \ + datasets['Train_Biased']['features'], \ + datasets['Test_Biased']['features'], \ + datasets['Train_Biased']['labels'], \ + datasets['Test_Biased']['labels'], \ + datasets['Train_Biased']['timings'], \ + datasets['Test_Biased']['timings'], \ + datasets['Train_Biased']['projections'], \ + datasets['Test_Biased']['projections'], \ + datasets['Train_Biased']['cells'], \ + datasets['Test_Biased']['cells'] = \ train_test_split(dataset['features'], dataset['labels'], dataset['timings'], @@ -62,7 +62,7 @@ def create_train_test_datasets(): datasets[f'{purpose}_Balanced'] = \ {key: elem for key, elem in zip(keys, balance_dataset( - *[datasets[f'{purpose}_Normal'][key2] + *[datasets[f'{purpose}_Biased'][key2] for key2 in keys], nvar=3)) ##CHOOSE NVAR WELL } datasets[f'{purpose}_Balanced']['labels'] = \ @@ -70,7 +70,7 @@ def create_train_test_datasets(): datasets[f'{purpose}_Augmented'] = \ {key: elem for key, elem in zip(keys, augmentate_dataset( - *[datasets[f'{purpose}_Normal'][key2] + *[datasets[f'{purpose}_Biased'][key2] for key2 in keys], nvar=3)) } print(f"features in {purpose}_Augmented", len(datasets[f'{purpose}_Augmented']['features'][0])) @@ -79,7 +79,7 @@ def create_train_test_datasets(): for purpose in purposes: for quality in dataset_qualities: this_dataset_filename = \ - find_dataset_filename(purpose, method=quality) + find_dataset_filename(purpose, dataset_quality=quality) with open(this_dataset_filename, 'wb') as this_dataset_file: pickle.dump(datasets[purpose + '_' + quality], this_dataset_file) @@ -109,12 +109,14 @@ def create_train_test_datasets(): # + [str(len(y[f'{purpose}_{method}']))]) -def create_regression_datasets(taking_logarithms=True): +def create_regression_datasets(dataset_quality='Augmented', + taking_logarithms=True): for purpose in purposes: - this_dataset_filename = find_dataset_filename(purpose, - method='augmented') + existing_dataset_filename = find_dataset_filename( + purpose, + dataset_quality=dataset_quality) # we will use the augmented dataset here - with open(this_dataset_filename, 'rb') as this_dataset_file: + with open(existing_dataset_filename, 'rb') as this_dataset_file: regression_dataset = pickle.load(this_dataset_file) regression_dataset['labels'] = \ [timings[0] for timings @@ -123,9 +125,11 @@ def create_regression_datasets(taking_logarithms=True): regression_dataset['labels'] = \ [log(label) for label in regression_dataset['labels']] - this_dataset_filename =\ - find_dataset_filename(purpose, method='regression') - with open(this_dataset_filename, 'wb') as this_dataset_file: + new_dataset_filename = find_dataset_filename( + purpose, + dataset_quality=dataset_quality, + paradigm='Regression') + with open(new_dataset_filename, 'wb') as this_dataset_file: pickle.dump(regression_dataset, this_dataset_file) # classification_dataset = regression_dataset # classification_dataset['labels'] = \ diff --git a/train_models.py b/train_models.py index 89f685a..8d6df33 100644 --- a/train_models.py +++ b/train_models.py @@ -15,29 +15,30 @@ from test_models 
import compute_metrics -def train_model(ml_model, method): - train_data_filename = find_dataset_filename('Train', method=method) - hyperparams_file = find_hyperparams_filename(method, ml_model) +def train_model(model_name, paradigm, training_quality): + train_data_filename = find_dataset_filename('Train', dataset_quality=training_quality, paradigm=paradigm) + print(model_name, 'dataset used for train', train_data_filename) + hyperparams_file = find_hyperparams_filename(model_name, paradigm=paradigm, training_quality=training_quality) with open(train_data_filename, 'rb') as train_data_file: train_dataset = pickle.load(train_data_file) hyperparams = read_yaml_from_file(hyperparams_file) - current_model = all_models[ml_model] + current_model = all_models[model_name] model = current_model(**hyperparams) # model = current_model() print('here') model.fit(train_dataset['features'], train_dataset['labels']) - trained_model_filename = find_model_filename(method, ml_model) + trained_model_filename = find_model_filename(model_name, paradigm, training_quality) print('here2') with open(trained_model_filename, 'wb') as trained_model_file: pickle.dump(model, trained_model_file) return model -def train_regression_model(ml_model, method): +def train_regression_model(model_name, method): train_data_filename = find_dataset_filename('Train', method=method) with open(train_data_filename, 'rb') as train_data_file: train_dataset = pickle.load(train_data_file) - # hyperparams_file = find_hyperparams_filename(method, ml_model) + # hyperparams_file = find_hyperparams_filename(method, model_name) # hyperparams = read_yaml_from_file(hyperparams_file) train_dataset['features'] = np.asarray([x_t for x_t, t_t in zip(train_dataset['features'], train_dataset['timings']) if t_t[:4] != 'Over'], dtype=float) @@ -46,10 +47,10 @@ def train_regression_model(ml_model, method): #### # IS THIS REALLY DOING SOMTHING? 
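# (It does: the comprehension above keeps only the feature rows whose timing
#  string does not start with 'Over', i.e. it silently drops every training
#  instance that hit the time limit.)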
# What if we used twice timelimit instead - current_model = ml_regressors[ml_model] + current_model = ml_regressors[model_name] reg = current_model() # **hyperparams) reg.fit(train_dataset['features'], train_dataset['timings']) - # trained_model_filename = find_model_filename(method, ml_model, 'regression') + # trained_model_filename = find_model_filename(method, model_name, 'regression') # with open(trained_model_filename, 'wb') as trained_model_file: # pickle.dump(reg, trained_model_file) return reg @@ -71,13 +72,13 @@ def test_regression_model(method, regressor): y_pred = [choose_using_regression(x_i, regressor) for x_i in x_test] -def train_reinforcement_model(ml_model, method='Normal'): +def train_reinforcement_model(model_name, method='Normal'): train_data_filename = find_dataset_filename('Train', method=method) with open(train_data_filename, 'rb') as train_data_file: train_dataset = pickle.load(train_data_file) - # hyperparams_file = find_hyperparams_filename(method, ml_model) + # hyperparams_file = find_hyperparams_filename(method, model_name) # hyperparams = read_yaml_from_file(hyperparams_file) - current_model = all_models[ml_model] + current_model = all_models[model_name] # model = current_model(**hyperparams) model = current_model() first_polys = train_dataset['projections'][0][0][0] @@ -94,7 +95,7 @@ def train_reinforcement_model(ml_model, method='Normal'): training_labels += new_training_labels model.fit(training_features, training_labels) print(test_reinforcement_model(model)) - trained_model_filename = find_model_filename('reinforcement', ml_model) + trained_model_filename = find_model_filename('reinforcement', model_name) with open(trained_model_filename, 'wb') as trained_model_file: pickle.dump(model, trained_model_file) @@ -167,14 +168,14 @@ def ordering_choice_reinforcement(model, projections): return ordering -def test_reinforcement_model(ml_model, method='Normal', nvar=3): +def test_reinforcement_model(model_name, method='Normal', nvar=3): train_data_filename = find_dataset_filename('Test', method=method) with open(train_data_filename, 'rb') as train_data_file: testing_dataset = pickle.load(train_data_file) - # trained_model_filename = find_model_filename('reinforcement', ml_model) + # trained_model_filename = find_model_filename('reinforcement', model_name) # with open(trained_model_filename, 'rb') as trained_model_file: # model = pickle.load(trained_model_file) - model = ml_model + model = model_name chosen_indices = [ordering_choice_reinforcement(model, projections) for projections in testing_dataset['projections']] metrics = compute_metrics(chosen_indices,