diff --git a/choose_hyperparams.py b/choose_hyperparams.py index 4e3c7da..b565d30 100644 --- a/choose_hyperparams.py +++ b/choose_hyperparams.py @@ -1,19 +1,3 @@ -""" -The experiments in [1] are replicated with some changes. - -The first change is that the testing data is balanced, so that all targets -are almost equally common. -Then we use three training sets; dataset as in [1], balanced dataset -and data augmentation dataset. - -[1]Florescu, D., England, M. (2020). A Machine Learning Based Software Pipeline -to Pick the Variable Ordering for Algorithms with Polynomial Inputs. -Bigatti, A., Carette, J., Davenport, J., Joswig, M., de Wolff, T. (eds) -Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science, -vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30 -""" - - import os import pickle import csv @@ -47,66 +31,71 @@ def choose_hyperparams(ml_model, method): chosen by cross validation is created""" this_dataset_file = find_dataset_filename('train', method=method) with open(this_dataset_file, 'rb') as f: - method_x_train, method_y_train = pickle.load(f) - hyperparams = k_folds_ml(method_x_train, method_y_train, model=ml_model) + x_train, y_train, _ = pickle.load(f) + hyperparams = k_folds_ml(x_train, y_train, model=ml_model) hyperparams_filename = find_hyperparams_filename(method, ml_model) write_yaml_to_file(hyperparams, hyperparams_filename) -test_balanced_dataset_file = os.path.join(os.path.dirname(__file__), - 'datasets', 'test', - 'balanced_test_dataset.txt') -with open(test_balanced_dataset_file, 'rb') as g: - balanced_x_test, balanced_y_test = pickle.load(g) +# test_balanced_dataset_file = os.path.join(os.path.dirname(__file__), +# 'datasets', 'test', +# 'balanced_test_dataset.txt') +# with open(test_balanced_dataset_file, 'rb') as g: +# balanced_x_test, balanced_y_test = pickle.load(g) + +# test_normal_dataset_file = os.path.join(os.path.dirname(__file__), +# 'datasets', 'test', +# 'normal_test_dataset.txt') +# with open(test_normal_dataset_file, 'rb') as g: +# normal_x_test, normal_y_test = pickle.load(g) + +# output_file_balanced = os.path.join(os.path.dirname(__file__), +# 'ml_results_k_fold_tested_in_balanced.csv') +# with open(output_file_balanced, 'w') as f_balanced: +# writer_balanced = csv.writer(f_balanced) +# writer_balanced.writerow(["Name"] + dataset_types) +# output_file_normal = os.path.join(os.path.dirname(__file__), +# 'ml_results_k_fold_tested_in_normal.csv') +# with open(output_file_normal, 'w') as f_normal: +# writer_normal = csv.writer(f_normal) +# writer_normal.writerow(["Name"] + dataset_types) +# for ml_model in ml_models: +# print(f"Model: {ml_model}") +# acc_balanced = dict() +# acc_normal = dict() +# for method in dataset_types: +# this_dataset_file = os.path.join(os.path.dirname(__file__), +# 'datasets', 'train', +# f'{method}_train_dataset.txt') +# with open(this_dataset_file, 'rb') as f: +# x_train, y_train, _ = pickle.load(f) +# hyperparams = k_folds_ml(x_train, y_train, +# model=ml_model) +# write_yaml_to_file(hyperparams, +# os.path.join(os.path.dirname(__file__), +# 'config', 'hyperparams', +# f'{method}_{ml_model}')) +# current_classifier = classifiers[ml_model] +# clf = current_classifier(**hyperparams) +# clf.fit(x_train, y_train) +# acc_balanced[method] = clf.score(balanced_x_test, +# balanced_y_test) +# acc_normal[method] = clf.score(normal_x_test, normal_y_test) +# method_filename = os.path.join(os.path.dirname(__file__), +# 'config', 'models', +# f'{method}_trained_model.txt') +# with open(method_filename, 'wb') as method_file: +# pickle.dump(clf, method_file) +# round_accuracies_balanced = [round(acc, 2) +# for acc in [acc_balanced[method_here] +# for method_here in dataset_types]] +# round_accuracies_normal = [round(acc, 2) +# for acc in [acc_normal[method_here] +# for method_here in dataset_types]] +# writer_balanced.writerow([ml_model] + round_accuracies_balanced) +# writer_normal.writerow([ml_model] + round_accuracies_normal) -test_normal_dataset_file = os.path.join(os.path.dirname(__file__), - 'datasets', 'test', - 'normal_test_dataset.txt') -with open(test_normal_dataset_file, 'rb') as g: - normal_x_test, normal_y_test = pickle.load(g) -output_file_balanced = os.path.join(os.path.dirname(__file__), - 'ml_results_k_fold_tested_in_balanced.csv') -with open(output_file_balanced, 'w') as f_balanced: - writer_balanced = csv.writer(f_balanced) - writer_balanced.writerow(["Name"] + dataset_types) - output_file_normal = os.path.join(os.path.dirname(__file__), - 'ml_results_k_fold_tested_in_normal.csv') - with open(output_file_normal, 'w') as f_normal: - writer_normal = csv.writer(f_normal) - writer_normal.writerow(["Name"] + dataset_types) - for ml_model in ml_models: - print(f"Model: {ml_model}") - acc_balanced = dict() - acc_normal = dict() - for method in dataset_types: - this_dataset_file = os.path.join(os.path.dirname(__file__), - 'datasets', 'train', - f'{method}_train_dataset.txt') - with open(this_dataset_file, 'rb') as f: - method_x_train, method_y_train = pickle.load(f) - hyperparams = k_folds_ml(method_x_train, method_y_train, - model=ml_model) - write_yaml_to_file(hyperparams, - os.path.join(os.path.dirname(__file__), - 'config', 'hyperparams', - f'{method}_{ml_model}')) - current_classifier = classifiers[ml_model] - clf = current_classifier(**hyperparams) - clf.fit(method_x_train, method_y_train) - acc_balanced[method] = clf.score(balanced_x_test, - balanced_y_test) - acc_normal[method] = clf.score(normal_x_test, normal_y_test) - method_file = os.path.join(os.path.dirname(__file__), - 'config', 'models', - f'{method}_trained_model.txt') - with open(method_file, 'wb') as f_method: - pickle.dump(clf, f_method) - round_accuracies_balanced = [round(acc, 2) - for acc in [acc_balanced[method_here] - for method_here in dataset_types]] - round_accuracies_normal = [round(acc, 2) - for acc in [acc_normal[method_here] - for method_here in dataset_types]] - writer_balanced.writerow([ml_model] + round_accuracies_balanced) - writer_normal.writerow([ml_model] + round_accuracies_normal) +# model = 'KNN' +# method = 'balanced' +# choose_hyperparams(model, method) \ No newline at end of file diff --git a/create_clean_dataset.py b/create_clean_dataset.py index c827fa4..2b2d463 100644 --- a/create_clean_dataset.py +++ b/create_clean_dataset.py @@ -1,3 +1,8 @@ +"""This file contains a function that given the raw dataset containing +the sets of polynomials and its timings for each order, creates a dataset +containing a set of unique features and its class""" + +import os import pickle import numpy as np from replicating_Dorians_features import extract_features @@ -6,6 +11,30 @@ from dataset_manipulation import remove_notunique_features else: from packages.dataset_manipulation import remove_notunique_features +from from_poly_set_to_features import poly_set_feature_extractor + + +def create_dataframe(dataset): + all_features = [] + all_targets = dataset[1][:] + all_timings = dataset[2][:] + all_original_polynomials = [] + for index, all_projections in enumerate(dataset[0]): + original_polynomials = all_projections[0][0] + all_original_polynomials.append(original_polynomials) + names, all_features = poly_set_feature_extractor(all_original_polynomials, + determine_standarization=True, + determine_unique_features=True) + return np.array(all_original_polynomials), np.array(names),\ + np.array(all_features), np.array(all_targets), np.array(all_timings) + + +dataset_filename = os.path.join(os.path.dirname(__file__), + 'DatasetsBeforeProcessing', + 'dataset_without_repetition_return_ncells.txt') +with open(dataset_filename, 'rb') as f: + dataset = pickle.load(f) +original_polys_list, names, features_list, targets_list, timings_list = create_dataframe(dataset) def cleaning_dataset(dataset_filename, clean_dataset_filename): @@ -20,7 +49,6 @@ def cleaning_dataset(dataset_filename, clean_dataset_filename): targets = np.array(targets_list) timings = np.array(timings_list) original_polys = np.array(original_polys_list) - with open(clean_dataset_filename, 'wb') as clean_dataset_file: dataset = pickle.dump((original_polys, unique_names, unique_features, targets, timings), diff --git a/datasets/dataset_instances.csv b/datasets/dataset_instances.csv index a4eeeff..ea0d247 100644 --- a/datasets/dataset_instances.csv +++ b/datasets/dataset_instances.csv @@ -1,7 +1,7 @@ dataset,zero,one,two,three,four,five,total train normal dataset,326,74,105,41,163,106,815 -train balanced dataset,130,120,135,143,135,152,815 +train balanced dataset,118,136,125,149,134,153,815 train augmented dataset,815,815,815,815,815,815,4890 test normal dataset,80,19,30,10,39,26,204 -test balanced dataset,34,31,32,37,39,31,204 +test balanced dataset,39,32,36,29,31,37,204 test augmented dataset,204,204,204,204,204,204,1224 diff --git a/datasets/test/augmented_test_dataset.txt b/datasets/test/augmented_test_dataset.txt index 58e811f..a31e8f4 100644 Binary files a/datasets/test/augmented_test_dataset.txt and b/datasets/test/augmented_test_dataset.txt differ diff --git a/datasets/test/balanced_test_dataset.txt b/datasets/test/balanced_test_dataset.txt index c0686fb..f289c71 100644 Binary files a/datasets/test/balanced_test_dataset.txt and b/datasets/test/balanced_test_dataset.txt differ diff --git a/datasets/test/normal_test_dataset.txt b/datasets/test/normal_test_dataset.txt index 90a466f..e5e60af 100644 Binary files a/datasets/test/normal_test_dataset.txt and b/datasets/test/normal_test_dataset.txt differ diff --git a/datasets/train/augmented_train_dataset.txt b/datasets/train/augmented_train_dataset.txt index c40cd59..f75da24 100644 Binary files a/datasets/train/augmented_train_dataset.txt and b/datasets/train/augmented_train_dataset.txt differ diff --git a/datasets/train/balanced_train_dataset.txt b/datasets/train/balanced_train_dataset.txt index 34573c3..248a130 100644 Binary files a/datasets/train/balanced_train_dataset.txt and b/datasets/train/balanced_train_dataset.txt differ diff --git a/datasets/train/normal_train_dataset.txt b/datasets/train/normal_train_dataset.txt index 4ef767e..c5f60ab 100644 Binary files a/datasets/train/normal_train_dataset.txt and b/datasets/train/normal_train_dataset.txt differ diff --git a/main.py b/main.py index 1a2eca1..6982b59 100644 --- a/main.py +++ b/main.py @@ -22,16 +22,20 @@ from test_models import test_results -original_dataset_file = find_dataset_filename('unclean') -clean_dataset_filename = find_dataset_filename('clean') -cleaning_dataset(original_dataset_file, clean_dataset_filename) -create_train_test_datasets() +# original_dataset_file = find_dataset_filename('unclean') +# clean_dataset_filename = find_dataset_filename('clean') +# cleaning_dataset(original_dataset_file, clean_dataset_filename) +# create_train_test_datasets() +# for ml_model in ml_models: +# for method in dataset_types: +# print(f"Choosing hyperparameters for {ml_model} in {method}") +# choose_hyperparams(ml_model, method) for ml_model in ml_models: + print(f"Training {ml_model}") for method in dataset_types: - choose_hyperparams(ml_model, method) -for ml_model in ml_models: - for method in dataset_types: + print(f"for {method}") train_model(ml_model, method) -for testing_method in ['normal', 'balanced']: +for testing_method in dataset_types: + print(f"Testing {testing_method}") test_results(testing_method) diff --git a/packages/build/lib/dataset_manipulation/__init__.py b/packages/build/lib/dataset_manipulation/__init__.py index 0107d58..13a0493 100644 --- a/packages/build/lib/dataset_manipulation/__init__.py +++ b/packages/build/lib/dataset_manipulation/__init__.py @@ -1,4 +1,5 @@ from .dataset_manipulation import augmentate_dataset # noqa401 from .dataset_manipulation import balance_dataset # noqa401 from .dataset_manipulation import name_unique_features # noqa401 -from .dataset_manipulation import remove_notunique_features # noqa401 \ No newline at end of file +from .dataset_manipulation import remove_notunique_features # noqa401 +from .exploit_symmetries import give_all_symmetries # noqa401 \ No newline at end of file diff --git a/packages/build/lib/dataset_manipulation/dataset_manipulation.py b/packages/build/lib/dataset_manipulation/dataset_manipulation.py index 8c90979..974ce1a 100644 --- a/packages/build/lib/dataset_manipulation/dataset_manipulation.py +++ b/packages/build/lib/dataset_manipulation/dataset_manipulation.py @@ -3,11 +3,12 @@ import math import random from .exploit_symmetries import give_all_symmetries +# from sklearn.preprocessing import normalize nvar = 3 -def augmentate_dataset(features, targets): +def augmentate_dataset(features, targets, timings): """ Multiply the size of the dataset by 6. @@ -17,13 +18,16 @@ def augmentate_dataset(features, targets): """ symmetric_features = [] symmetric_targets = [] - for features, target in zip(features, targets): + symmetric_timings = [] + for features, target, timing in zip(features, targets, timings): symmetric_features += give_all_symmetries(features, int(target)) symmetric_targets += list(range(math.factorial(nvar))) - return np.array(symmetric_features), np.array(symmetric_targets) + symmetric_timings += list(timing) + return np.array(symmetric_features), np.array(symmetric_targets), \ + np.array(symmetric_timings) -def balance_dataset(features, targets): +def balance_dataset(features, targets, timings): """ Balance the dataset so all targets are almost equally common. @@ -33,13 +37,16 @@ def balance_dataset(features, targets): """ balanced_features = [] balanced_targets = [] - for features, target in zip(features, targets): + balanced_timings = [] + for features, target, timing in zip(features, targets, timings): symmetric_features = give_all_symmetries(features, int(target)) possible_targets = list(range(math.factorial(nvar))) new_target = random.choice(possible_targets) balanced_features.append(symmetric_features[new_target]) balanced_targets.append(new_target) - return np.array(balanced_features), np.array(balanced_targets) + balanced_timings.append(timing[new_target]) + return np.array(balanced_features), np.array(balanced_targets),\ + np.array(balanced_timings) def name_unique_features(names, features): @@ -57,16 +64,36 @@ def name_unique_features(names, features): for ex_feature in new_features]) or np.std(feature) == 0): rep += 1 + elif feature.count(feature[0]) == len(feature): + print(names[index]) else: + # if 'max_in_polys_max_sig'==names[index][:20]: + # print("Check ", feature.count(feature[0])==len(feature)) + # print(names[index]) + # print(len(feature)) new_features.append(feature) new_names.append(names[index]) return new_names -def remove_notunique_features(unique_names, names, features): +def get_unique_feature_names(unique_names, names, features): """Return the features corresponding to a name in 'unique_names'.""" unique_features = [] for index, feature in enumerate(zip(*features)): if names[index] in unique_names: unique_features.append(feature) return np.transpose(unique_features) + + +def remove_notunique_features(names, features): + # creating some targets and timing because the function requires them + targets = [0]*len(features) + timings = [[0, 0]]*len(features) + augmented_features, _, _ = augmentate_dataset(features, targets, timings) + # normalized_augmented_features = normalize(augmented_features) + unique_names = name_unique_features(names, augmented_features) + unique_features = [] + for index, feature in enumerate(zip(*features)): + if names[index] in unique_names: + unique_features.append(feature) + return unique_names, np.transpose(unique_features) diff --git a/packages/build/lib/dataset_manipulation/exploit_symmetries.py b/packages/build/lib/dataset_manipulation/exploit_symmetries.py index 0422b8d..4f41a3c 100644 --- a/packages/build/lib/dataset_manipulation/exploit_symmetries.py +++ b/packages/build/lib/dataset_manipulation/exploit_symmetries.py @@ -37,7 +37,9 @@ def features_to_canonical_target(features, optimal_ordering): def give_all_symmetries(features, optimal_ordering): - """Reorder the features for all possible targets.""" + """Reorder the features for all possible targets. + Returns a list of of all symmetries, the first one + corresponding to the optimal ordering""" ordered_features = features_to_canonical_target(features, optimal_ordering) all_symmetries = [] diff --git a/packages/dataset_manipulation/__init__.py b/packages/dataset_manipulation/__init__.py index 0107d58..13a0493 100644 --- a/packages/dataset_manipulation/__init__.py +++ b/packages/dataset_manipulation/__init__.py @@ -1,4 +1,5 @@ from .dataset_manipulation import augmentate_dataset # noqa401 from .dataset_manipulation import balance_dataset # noqa401 from .dataset_manipulation import name_unique_features # noqa401 -from .dataset_manipulation import remove_notunique_features # noqa401 \ No newline at end of file +from .dataset_manipulation import remove_notunique_features # noqa401 +from .exploit_symmetries import give_all_symmetries # noqa401 \ No newline at end of file diff --git a/packages/dataset_manipulation/dataset_manipulation.py b/packages/dataset_manipulation/dataset_manipulation.py index cdc82d4..008b365 100644 --- a/packages/dataset_manipulation/dataset_manipulation.py +++ b/packages/dataset_manipulation/dataset_manipulation.py @@ -3,7 +3,7 @@ import math import random from .exploit_symmetries import give_all_symmetries -from sklearn.preprocessing import normalize +# from sklearn.preprocessing import normalize nvar = 3 @@ -23,7 +23,8 @@ def augmentate_dataset(features, targets, timings): symmetric_features += give_all_symmetries(features, int(target)) symmetric_targets += list(range(math.factorial(nvar))) symmetric_timings += list(timing) - return np.array(symmetric_features), np.array(symmetric_targets), np.array(symmetric_timings) + return np.array(symmetric_features), np.array(symmetric_targets), \ + np.array(symmetric_timings) def balance_dataset(features, targets, timings): @@ -44,7 +45,8 @@ def balance_dataset(features, targets, timings): balanced_features.append(symmetric_features[new_target]) balanced_targets.append(new_target) balanced_timings.append(timing[new_target]) - return np.array(balanced_features), np.array(balanced_targets), np.array(balanced_timings) + return np.array(balanced_features), np.array(balanced_targets),\ + np.array(balanced_timings) def name_unique_features(names, features): @@ -58,11 +60,14 @@ def name_unique_features(names, features): new_names = [] rep = 0 for index, feature in enumerate(zip(*features)): + # print(feature) + # if any([type(xfeature) == str for xfeature in feature]): + # print(feature) if (any([np.array_equal(feature, ex_feature) for ex_feature in new_features]) or np.std(feature) == 0): rep += 1 - elif feature.count(feature[0])==len(feature): + elif feature.count(feature[0]) == len(feature): print(names[index]) else: # if 'max_in_polys_max_sig'==names[index][:20]: @@ -86,7 +91,7 @@ def get_unique_feature_names(unique_names, names, features): def remove_notunique_features(names, features): # creating some targets and timing because the function requires them targets = [0]*len(features) - timings = [[0,0]]*len(features) + timings = [[0, 0]]*len(features) augmented_features, _, _ = augmentate_dataset(features, targets, timings) # normalized_augmented_features = normalize(augmented_features) unique_names = name_unique_features(names, augmented_features) diff --git a/packages/dataset_manipulation/exploit_symmetries.py b/packages/dataset_manipulation/exploit_symmetries.py index 0422b8d..4f41a3c 100644 --- a/packages/dataset_manipulation/exploit_symmetries.py +++ b/packages/dataset_manipulation/exploit_symmetries.py @@ -37,7 +37,9 @@ def features_to_canonical_target(features, optimal_ordering): def give_all_symmetries(features, optimal_ordering): - """Reorder the features for all possible targets.""" + """Reorder the features for all possible targets. + Returns a list of of all symmetries, the first one + corresponding to the optimal ordering""" ordered_features = features_to_canonical_target(features, optimal_ordering) all_symmetries = [] diff --git a/replicating_Dorians_features.py b/replicating_Dorians_features.py index 798778a..b15013f 100644 --- a/replicating_Dorians_features.py +++ b/replicating_Dorians_features.py @@ -3,29 +3,28 @@ from xml.sax.handler import all_features import numpy as np -nvar=3 - - - def aveg(given_list): return sum(given_list)/len(given_list) + def aveg_not_zero(given_list): return sum(given_list)/max(1,len([1 for elem in given_list if elem!=0])) + def identity(input): return input + def sign(input): - if type(input)==list: + if type(input) == list: return [sign(elem) for elem in input] else: - if input>0: + if input > 0: return 1 - elif input<0: + elif input < 0: return -1 - elif input==0: + elif input == 0: return 0 else: raise Exception("How is this possible?") @@ -51,21 +50,29 @@ def extract_features(dataset): all_original_polynomials = [] for index, all_projections in enumerate(dataset[0]): original_polynomials = all_projections[0][0] + # the original polynomials are the initial polynomials of any + # of the possible projections (also of the first one) all_original_polynomials.append(original_polynomials) - names = [] - instance_features = [] all_targets.append(dataset[1][index]) all_timings.append(dataset[2][index]) - for var in range(nvar): - degrees = [[monomial[var] for monomial in poly] - for poly in original_polynomials] - var_features, var_features_names = create_features(degrees, - variable=var) - instance_features += var_features - names += var_features_names - sdegrees = [[sum(monomial) for monomial in poly if monomial[var]!=0]+[0] for poly in original_polynomials] - svar_features, svar_features_names = create_features(sdegrees, variable=var, sv=True) - instance_features += svar_features - names += svar_features_names + names, instance_features = features_from_set_of_polys(original_polynomials) all_features.append(instance_features) return np.array(all_original_polynomials), np.array(names), np.array(all_features), np.array(all_targets), np.array(all_timings) + + +def features_from_set_of_polys(original_polynomials): + instance_features = [] + names = [] + nvar = len(original_polynomials[0][0]) - 1 + for var in range(nvar): + degrees = [[monomial[var] for monomial in poly] + for poly in original_polynomials] + var_features, var_features_names = create_features(degrees, + variable=var) + instance_features += var_features + names += var_features_names + sdegrees = [[sum(monomial) for monomial in poly if monomial[var]!=0]+[0] for poly in original_polynomials] + svar_features, svar_features_names = create_features(sdegrees, variable=var, sv=True) + instance_features += svar_features + names += svar_features_names + return names, instance_features diff --git a/test_train_datasets.py b/test_train_datasets.py index 53d60d3..c2b2a86 100644 --- a/test_train_datasets.py +++ b/test_train_datasets.py @@ -1,20 +1,3 @@ -""" -The experiments in [1] are replicated with some changes. - -The first change is that the testing data is balanced, so that all targets -are almost equally common. -Then we use three training sets; dataset as in [1], balanced dataset -and data augmentation dataset. - -[1]Florescu, D., England, M. (2020). A Machine Learning Based Software Pipeline -to Pick the Variable Ordering for Algorithms with Polynomial Inputs. -Bigatti, A., Carette, J., Davenport, J., Joswig, M., de Wolff, T. (eds) -Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science, -vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30 -""" - - -import os import pickle import csv import importlib.util @@ -29,6 +12,7 @@ from packages.dataset_manipulation import augmentate_dataset from sklearn.model_selection import train_test_split from find_filename import find_dataset_filename +from find_filename import find_other_filename def count_instances(my_dataset, instance): @@ -40,7 +24,9 @@ def create_train_test_datasets(): with open(clean_dataset_filename, 'rb') as clean_dataset_file: _, names, features, targets, timings = pickle.load(clean_dataset_file) unique_names, unique_features = remove_notunique_features(names, features) - + unique_features_filename = find_other_filename("unique_features") + with open(unique_features_filename, 'wb') as unique_features_file: + pickle.dump(unique_features_filename, unique_features_file) x = dict() # to keep the features y = dict() # to keep the labels t = dict() # to keep the timings @@ -58,11 +44,9 @@ def create_train_test_datasets(): writer.writerow(['dataset'] + ['zero', 'one', 'two', 'three', 'four', 'five', 'total']) for usage in ['train', 'test']: for method in ['normal', 'balanced', 'augmented']: - this_dataset_file = os.path.join(os.path.dirname(__file__), - 'datasets', usage, - f'{method}_{usage}_dataset.txt') - with open(this_dataset_file, 'wb') as f: - pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}']), f) + this_dataset_filename = find_dataset_filename(usage, method=method) + with open(this_dataset_filename, 'wb') as this_dataset_file: + pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}'], t[f'{usage}_{method}']), this_dataset_file) writer.writerow([f'{usage} {method} dataset'] + [str(count_instances(y[f'{usage}_{method}'], i)) diff --git a/train_models.py b/train_models.py index 0576dcd..044c15b 100644 --- a/train_models.py +++ b/train_models.py @@ -1,16 +1,21 @@ import pickle from yaml_tools import read_yaml_from_file from config.ml_models import classifiers +from config.ml_models import ml_regressors +from config.ml_models import regressors from find_filename import find_dataset_filename from find_filename import find_hyperparams_filename from find_filename import find_model_filename +from dataset_manipulation import give_all_symmetries +import numpy as np +from sklearn import metrics def train_model(ml_model, method): train_data_filename = find_dataset_filename('train', method=method) hyperparams_file = find_hyperparams_filename(method, ml_model) with open(train_data_filename, 'rb') as train_data_file: - x_train, y_train = pickle.load(train_data_file) + x_train, y_train, _ = pickle.load(train_data_file) hyperparams = read_yaml_from_file(hyperparams_file) current_classifier = classifiers[ml_model] clf = current_classifier(**hyperparams) @@ -18,3 +23,53 @@ def train_model(ml_model, method): trained_model_filename = find_model_filename(method, ml_model) with open(trained_model_filename, 'wb') as trained_model_file: pickle.dump(clf, trained_model_file) + + +def train_regression_model(ml_model, method): + train_data_filename = find_dataset_filename('train', method=method) + with open(train_data_filename, 'rb') as train_data_file: + x_train, _, t_train = pickle.load(train_data_file) + # hyperparams_file = find_hyperparams_filename(method, ml_model) + # hyperparams = read_yaml_from_file(hyperparams_file) + x_train = np.asarray([x_t for x_t, t_t in zip(x_train, t_train) + if t_t[:4] != 'Over'], dtype=float) + t_train = np.asarray([t_t for t_t in t_train + if t_t[:4] != 'Over'], dtype=float) + current_classifier = regressors[ml_model] + # print(t_train) + print("her") + reg = current_classifier() # **hyperparams) + reg.fit(x_train, t_train) + # trained_model_filename = find_model_filename(method, ml_model, 'regression') + # with open(trained_model_filename, 'wb') as trained_model_file: + # pickle.dump(reg, trained_model_file) + print("Real") + print(t_train[10:20]) + print("Predicted") + print(reg.predict(x_train)[10:20]) + print(metrics.mean_squared_error(reg.predict(x_train), t_train)) + return reg + + +def choose_using_regression(x_test, regressor): + timings = regressor.predict(give_all_symmetries(x_test, 0)) + return np.argmin(timings) + + +def test_regression_model(method, regressor): + test_data_filename = find_dataset_filename('test', method=method) + with open(test_data_filename, 'rb') as test_data_file: + x_test, y_test, t_test = pickle.load(test_data_file) + x_test = np.asarray([x_t for x_t, t_t in zip(x_test, t_test) + if t_t[:4] != 'Over'], dtype=float) + y_test = np.asarray([y_t for y_t, t_t in zip(y_test, t_test) + if t_t[:4] != 'Over'], dtype=float) + y_pred = [choose_using_regression(x_i, regressor) for x_i in x_test] + print("ACC", metrics.accuracy_score(y_test, y_pred)) + + +# for ml_reg in ml_regressors: +# print(ml_reg) +# regressor = train_regression_model(ml_reg, 'balanced') +# print(ml_reg) +# test_regression_model('balanced', regressor)