diff --git a/DatasetsBeforeProcessing/dataset_without_repetition_return_ncells.txt b/DatasetsBeforeProcessing/dataset_without_repetition_return_ncells.txt new file mode 100644 index 0000000..7e9e028 Binary files /dev/null and b/DatasetsBeforeProcessing/dataset_without_repetition_return_ncells.txt differ diff --git a/choose_hyperparams.py b/choose_hyperparams.py new file mode 100644 index 0000000..67a634c --- /dev/null +++ b/choose_hyperparams.py @@ -0,0 +1,106 @@ +""" +The experiments in [1] are replicated with some changes. + +The first change is that the testing data is balanced, so that all targets +are almost equally common. +Then we use three training sets; dataset as in [1], balanced dataset +and data augmentation dataset. + +[1]Florescu, D., England, M. (2020). A Machine Learning Based Software Pipeline +to Pick the Variable Ordering for Algorithms with Polynomial Inputs. +Bigatti, A., Carette, J., Davenport, J., Joswig, M., de Wolff, T. (eds) +Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science, +vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30 +""" + + +import os +import pickle +import csv +import yaml +import importlib.util +from config.ml_models import ml_models +from config.ml_models import classifiers +from config.ml_models import dataset_types +from config.hyperparameters_grid import grid +from sklearn.model_selection import GridSearchCV + + +def write_yaml_to_file(py_obj, filename): + with open(f'{filename}.yaml', 'w',) as f: + yaml.dump(py_obj, f, sort_keys=False) + print('Written to file successfully') + + +def k_folds_ml(x_train, y_train, model, random_state=0): + """ + Train the desired model. + + The hyperparameters of the models are chosen using 5-fold cross validation. + """ + current_classifier = classifiers[model] + current_grid = grid[model] + rf_cv = GridSearchCV(estimator=current_classifier(), + param_grid=current_grid, + cv=5) + rf_cv.fit(x_train, y_train) + return rf_cv.best_params_ + + +test_balanced_dataset_file = os.path.join(os.path.dirname(__file__), + 'datasets', 'test', + 'balanced_test_dataset.txt') +with open(test_balanced_dataset_file, 'rb') as g: + balanced_x_test, balanced_y_test = pickle.load(g) + +test_normal_dataset_file = os.path.join(os.path.dirname(__file__), + 'datasets', 'test', + 'normal_test_dataset.txt') +with open(test_normal_dataset_file, 'rb') as g: + normal_x_test, normal_y_test = pickle.load(g) + +output_file_balanced = os.path.join(os.path.dirname(__file__), + 'ml_results_k_fold_tested_in_balanced.csv') +with open(output_file_balanced, 'w') as f_balanced: + writer_balanced = csv.writer(f_balanced) + writer_balanced.writerow(["Name"] + dataset_types) + output_file_normal = os.path.join(os.path.dirname(__file__), + 'ml_results_k_fold_tested_in_normal.csv') + with open(output_file_normal, 'w') as f_normal: + writer_normal = csv.writer(f_normal) + writer_normal.writerow(["Name"] + dataset_types) + for ml_model in ml_models: + print(f"Model: {ml_model}") + acc_balanced = dict() + acc_normal = dict() + for method in dataset_types: + this_dataset_file = os.path.join(os.path.dirname(__file__), + 'datasets', 'train', + f'{method}_train_dataset.txt') + with open(this_dataset_file, 'rb') as f: + method_x_train, method_y_train = pickle.load(f) + hyperparams = k_folds_ml(method_x_train, method_y_train, + model=ml_model) + write_yaml_to_file(hyperparams, + os.path.join(os.path.dirname(__file__), + 'config', 'hyperparams', + f'{method}_{ml_model}')) + current_classifier = classifiers[ml_model] + 
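+                # retrain this classifier on the full training set using the
+                # hyperparameters selected by 5-fold cross-validation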
+                clf = current_classifier(**hyperparams)
+                clf.fit(method_x_train, method_y_train)
+                acc_balanced[method] = clf.score(balanced_x_test,
+                                                 balanced_y_test)
+                acc_normal[method] = clf.score(normal_x_test, normal_y_test)
+                method_file = os.path.join(os.path.dirname(__file__),
+                                           'config', 'models',
+                                           f'{method}_trained_model.txt')
+                with open(method_file, 'wb') as f_method:
+                    pickle.dump(clf, f_method)
+            round_accuracies_balanced = [round(acc, 2)
+                                         for acc in [acc_balanced[method_here]
+                                                     for method_here in dataset_types]]
+            round_accuracies_normal = [round(acc, 2)
+                                       for acc in [acc_normal[method_here]
+                                                   for method_here in dataset_types]]
+            writer_balanced.writerow([ml_model] + round_accuracies_balanced)
+            writer_normal.writerow([ml_model] + round_accuracies_normal)
diff --git a/create_clean_dataset.py b/create_clean_dataset.py
new file mode 100644
index 0000000..d447580
--- /dev/null
+++ b/create_clean_dataset.py
@@ -0,0 +1,28 @@
+"""Extract features from the raw dataset and pickle a clean copy of it."""
+import os
+import pickle
+
+import numpy as np
+
+from replicating_Dorians_features import extract_features
+
+
+dataset_file = os.path.join(os.path.dirname(__file__),
+                            'DatasetsBeforeProcessing',
+                            'dataset_without_repetition_return_ncells.txt')
+with open(dataset_file, 'rb') as f:
+    dataset = pickle.load(f)
+original_polys_list, names, features_list, targets_list, timings_list = \
+    extract_features(dataset)
+
+# working with raw features
+features = np.array(features_list)
+targets = np.array(targets_list)
+timings = np.array(timings_list)
+original_polys = np.array(original_polys_list)
+
+clean_dataset_file = os.path.join(os.path.dirname(__file__),
+                                  'datasets',
+                                  'clean_dataset.txt')
+with open(clean_dataset_file, 'wb') as g:
+    pickle.dump((original_polys, names, features, targets, timings), g)
diff --git a/datasets/clean_dataset.txt b/datasets/clean_dataset.txt
new file mode 100644
index 0000000..333c663
Binary files /dev/null and b/datasets/clean_dataset.txt differ
diff --git a/datasets/dataset_instances.csv b/datasets/dataset_instances.csv
new file mode 100644
index 0000000..6d1dc65
--- /dev/null
+++ b/datasets/dataset_instances.csv
@@ -0,0 +1,7 @@
+dataset,zero,one,two,three,four,five,total
+train normal dataset,326,74,105,41,163,106,815
+train balanced dataset,126,113,149,138,144,145,815
+train augmented dataset,815,815,815,815,815,815,4890
+test normal dataset,80,19,30,10,39,26,204
+test balanced dataset,31,34,32,38,34,35,204
+test augmented dataset,204,204,204,204,204,204,1224
diff --git a/datasets/dataset_without_repetition.txt b/datasets/dataset_without_repetition.txt
new file mode 100644
index 0000000..f8ccd14
Binary files /dev/null and b/datasets/dataset_without_repetition.txt differ
diff --git a/datasets/test/augmented_test_dataset.txt b/datasets/test/augmented_test_dataset.txt
new file mode 100644
index 0000000..5f66bb2
Binary files /dev/null and b/datasets/test/augmented_test_dataset.txt differ
diff --git a/datasets/test/balanced_test_dataset.txt b/datasets/test/balanced_test_dataset.txt
new file mode 100644
index 0000000..e9b91f0
Binary files /dev/null and b/datasets/test/balanced_test_dataset.txt differ
diff --git a/datasets/test/normal_test_dataset.txt b/datasets/test/normal_test_dataset.txt
new file mode 100644
index 0000000..8d72352
Binary files /dev/null and b/datasets/test/normal_test_dataset.txt differ
diff --git a/datasets/train/augmented_train_dataset.txt b/datasets/train/augmented_train_dataset.txt
new file mode 100644
index 0000000..00989a1
Binary files /dev/null
and b/datasets/train/augmented_train_dataset.txt differ diff --git a/datasets/train/balanced_train_dataset.txt b/datasets/train/balanced_train_dataset.txt new file mode 100644 index 0000000..731c787 Binary files /dev/null and b/datasets/train/balanced_train_dataset.txt differ diff --git a/datasets/train/normal_train_dataset.txt b/datasets/train/normal_train_dataset.txt new file mode 100644 index 0000000..e9096f9 Binary files /dev/null and b/datasets/train/normal_train_dataset.txt differ diff --git a/main.py b/main.py index a7407a7..71bae95 100644 --- a/main.py +++ b/main.py @@ -18,6 +18,7 @@ import pickle import random import csv +import yaml import importlib.util # Check if 'dataset_manipulation' is installed if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)): @@ -31,70 +32,105 @@ from packages.dataset_manipulation import balance_dataset from packages.dataset_manipulation import augmentate_dataset from sklearn.preprocessing import normalize -from preprocessing_Dorians_features import normalize_features # noqa401 from sklearn.model_selection import train_test_split from basic_ml import basic_ml +from k_folds_ml import k_folds_ml + +def write_yaml_to_file(py_obj,filename): + with open(f'{filename}.yaml', 'w',) as f : + yaml.dump(py_obj,f,sort_keys=False) + print('Written to file successfully') + names_features_targets_file = os.path.join(os.path.dirname(__file__), 'datasets', - 'names_features_targets.txt') + 'clean_dataset.txt') with open(names_features_targets_file, 'rb') as f: - names, features, targets = pickle.load(f) -augmented_features, augmented_targets = augmentate_dataset(features, targets) + original_polys, names, features, targets, timings = pickle.load(f) + + +augmented_features, augmented_targets, augmented_timings = augmentate_dataset(features, targets, timings) normalized_augmented_features = normalize(augmented_features) # an alternative approach to normalizing # features = np.transpose(normalize_features(features)) unique_names = name_unique_features(names, - normalized_augmented_features) + augmented_features) random_state = 0 # Other random states may be tried to check that similar results are achieved random.seed(random_state) # Models that will be used are chosen -ml_models = ['SVC', 'DT', 'KNN', 'RF', 'MPL', 'my_mlp'] +ml_models = ['KNN', 'DT', 'MLP', 'SVC', 'RF'] # , 'my_mlp' # train and test sets are created -x_train, x_test, y_train, y_test = train_test_split(features, targets, +x_train, x_test, y_train, y_test, t_train, t_test = train_test_split(features, targets, timings, test_size=0.20, random_state=random_state) # test features are balanced -bal_x_test, bal_y_test = balance_dataset(x_test, y_test) -# and the repeated features are removed before presenting them to any model +bal_x_test, bal_y_test, bal_t_test = balance_dataset(x_test, y_test, t_test) +# and the repeated features are removed before presenting them to any ml_model # we will ensure that instances send to the models dont have repeated features unique_bal_x_test = remove_notunique_features(unique_names, names, bal_x_test) # testing data for all approaches is ready unique_x_train = remove_notunique_features(unique_names, names, x_train) # training data without changes ready -bal_x_train, bal_y_train = balance_dataset(x_train, y_train) +bal_x_train, bal_y_train, bal_t_train = balance_dataset(x_train, y_train, t_train) unique_bal_x_train = remove_notunique_features(unique_names, names, bal_x_train) # balanced training data ready -aug_x_train, aug_y_train = 
augmentate_dataset(x_train, y_train)
+aug_x_train, aug_y_train, aug_t_train = augmentate_dataset(x_train, y_train, t_train)
 unique_aug_x_train = remove_notunique_features(unique_names, names, aug_x_train)
 # augmented training data ready
+# output_file = os.path.join(os.path.dirname(__file__),
+#                            'ml_results.csv')
+# with open(output_file, 'w') as f:
+#     writer = csv.writer(f)
+#     writer.writerow(["Name", "Normal", "Balance data", "Augment data"])
+#     for ml_model in ml_models:
+#         acc_basic = basic_ml(unique_x_train, unique_bal_x_test,
+#                              y_train, bal_y_test,
+#                              ml_model, random_state=random_state)
+
+#         acc_bal = basic_ml(unique_bal_x_train, unique_bal_x_test,
+#                            bal_y_train, bal_y_test,
+#                            ml_model, random_state=random_state)
+
+#         acc_augmented = basic_ml(unique_aug_x_train, unique_bal_x_test,
+#                                  aug_y_train, bal_y_test,
+#                                  ml_model, random_state=random_state)
+
+#         round_accuracies = [round(acc, 2) for acc in [acc_basic,
+#                                                       acc_bal,
+#                                                       acc_augmented]]
+#         writer.writerow([ml_model] + round_accuracies)
+
+# output_file = os.path.join(os.path.dirname(__file__),
+#                            'ml_results_k_fold.csv')
+# with open(output_file, 'w') as f:
+#     writer = csv.writer(f)
+#     writer.writerow(["Name", "Normal", "Balance data", "Augment data"])
+#     print(f"{method}")
+    # print(f"The accuracies of {ml_model} are:\n Normal: {acc_basic} \n Balanced: {acc_bal}\n Augmented: {acc_augmented}")
+
+    # round_accuracies = [round(acc, 2) for acc in [acc_basic,
+    #                                               acc_bal,
+    #                                               acc_augmented]]
+    # writer.writerow([ml_model] + round_accuracies)
-output_file = os.path.join(os.path.dirname(__file__),
-                           'ml_results.csv')
-with open(output_file, 'w') as f:
-    writer = csv.writer(f)
-    writer.writerow(["Name", "Normal", "Balance data", "Augment data"])
-    for ml_model in ml_models:
-        acc_basic = basic_ml(unique_x_train, unique_bal_x_test,
-                             y_train, bal_y_test,
-                             ml_model, random_state=random_state)
-
-        acc_bal = basic_ml(unique_bal_x_train, unique_bal_x_test,
-                           bal_y_train, bal_y_test,
-                           ml_model, random_state=random_state)
-
-        acc_augmented = basic_ml(unique_aug_x_train, unique_bal_x_test,
-                                 aug_y_train, bal_y_test,
-                                 ml_model, random_state=random_state)
-
-        round_accuracies = [round(acc, 2) for acc in [acc_basic,
-                                                      acc_bal,
-                                                      acc_augmented]]
-        writer.writerow([ml_model] + round_accuracies)
+# mapping from model names to their sklearn estimator classes,
+# as in choose_hyperparams.py
+from config.ml_models import classifiers
+
+x_and_y_per_method = dict()
+x_and_y_per_method['basic'] = (unique_x_train, y_train)
+x_and_y_per_method['balanced'] = (unique_bal_x_train, bal_y_train)
+x_and_y_per_method['augmented'] = (unique_aug_x_train, aug_y_train)
+for ml_model in ml_models:
+    print(f"Model: {ml_model}")
+    for method in ['basic', 'balanced', 'augmented']:
+        method_x_train, method_y_train = x_and_y_per_method[method]
+        hyperparams = k_folds_ml(method_x_train, method_y_train,
+                                 model=ml_model)
+        write_yaml_to_file(hyperparams,
+                           os.path.join(os.path.dirname(__file__),
+                                        'config', 'hyperparams',
+                                        f'{method}_{ml_model}'))
+        for train_data in ['basic', 'balanced']:
+            # instantiate the estimator behind the model name with the tuned
+            # hyperparameters (retraining on each set is still a stub)
+            clf = classifiers[ml_model](**hyperparams)
\ No newline at end of file
diff --git a/ml_results.csv b/ml_results.csv
index d5aa215..5ae9217 100644
--- a/ml_results.csv
+++ b/ml_results.csv
@@ -2,6 +2,6 @@ Name,Normal,Balance data,Augment data
 SVC,0.21,0.21,0.16
 DT,0.32,0.27,0.34
 KNN,0.29,0.39,0.51
-RF,0.38,0.49,0.55
+RF,0.42,0.5,0.61
 MPL,0.21,0.2,0.17
-my_mlp,0.28,0.32,0.36
+my_mlp,0.36,0.33,0.38
diff --git a/packages/build/lib/dataset_manipulation/__init__.py b/packages/build/lib/dataset_manipulation/__init__.py
new file mode 100644
index 0000000..0107d58
--- /dev/null
+++ b/packages/build/lib/dataset_manipulation/__init__.py
@@ -0,0 +1,4 @@
+from .dataset_manipulation import augmentate_dataset  # noqa401
+from .dataset_manipulation import balance_dataset  # noqa401
+from .dataset_manipulation import name_unique_features  # noqa401
+from .dataset_manipulation import remove_notunique_features  # noqa401
\ No newline at end of file
diff --git a/packages/build/lib/dataset_manipulation/dataset_manipulation.py b/packages/build/lib/dataset_manipulation/dataset_manipulation.py
new file mode 100644
index 0000000..8c90979
--- /dev/null
+++ b/packages/build/lib/dataset_manipulation/dataset_manipulation.py
@@ -0,0 +1,72 @@
+"""Exploit symmetries in polynomials to augmentate or balance the dataset."""
+import numpy as np
+import math
+import random
+from .exploit_symmetries import give_all_symmetries
+
+nvar = 3
+
+
+def augmentate_dataset(features, targets):
+    """
+    Multiply the size of the dataset by 6.
+
+    Arguments:
+    features: list(list(numpy.float))
+    targets: list(numpy.float)
+    """
+    symmetric_features = []
+    symmetric_targets = []
+    for features, target in zip(features, targets):
+        symmetric_features += give_all_symmetries(features, int(target))
+        symmetric_targets += list(range(math.factorial(nvar)))
+    return np.array(symmetric_features), np.array(symmetric_targets)
+
+
+def balance_dataset(features, targets):
+    """
+    Balance the dataset so all targets are almost equally common.
+
+    Arguments:
+    features: list(list(numpy.float))
+    targets: list(numpy.float)
+    """
+    balanced_features = []
+    balanced_targets = []
+    for features, target in zip(features, targets):
+        symmetric_features = give_all_symmetries(features, int(target))
+        possible_targets = list(range(math.factorial(nvar)))
+        new_target = random.choice(possible_targets)
+        balanced_features.append(symmetric_features[new_target])
+        balanced_targets.append(new_target)
+    return np.array(balanced_features), np.array(balanced_targets)
+
+
+def name_unique_features(names, features):
+    """
+    Return the name of unique features.
+
+    When two features share the same value for all the instances
+    one of them is not considered unique.
+    """
+    new_features = []
+    new_names = []
+    rep = 0
+    for index, feature in enumerate(zip(*features)):
+        if (any([np.array_equal(feature, ex_feature)
+                 for ex_feature in new_features])
+                or np.std(feature) == 0):
+            rep += 1
+        else:
+            new_features.append(feature)
+            new_names.append(names[index])
+    return new_names
+
+
+def remove_notunique_features(unique_names, names, features):
+    """Return the features corresponding to a name in 'unique_names'."""
+    unique_features = []
+    for index, feature in enumerate(zip(*features)):
+        if names[index] in unique_names:
+            unique_features.append(feature)
+    return np.transpose(unique_features)
diff --git a/packages/build/lib/dataset_manipulation/exploit_symmetries.py b/packages/build/lib/dataset_manipulation/exploit_symmetries.py
new file mode 100644
index 0000000..0422b8d
--- /dev/null
+++ b/packages/build/lib/dataset_manipulation/exploit_symmetries.py
@@ -0,0 +1,55 @@
+"""
+Exploit symmetries in three-variable polynomials to generate up to six
+instances out of each existing one.
+
+The task at hand is to classify these features.
+We take advantage of the fact that we can change the target by
+reordering the features.
+
+This file contains:
+- features_to_canonical_target: a function that reorders the features so
+that the target becomes 0 (the identity ordering), the canonical order.
+- give_all_symmetries: a function that, given the canonically ordered
+features, returns the reordering corresponding to each possible target.
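+
+For example, with nvar = 3 the feature vector consists of three blocks, one
+per variable; give_all_symmetries returns the six block permutations of the
+canonically ordered vector, one for each of the possible targets.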
+""" +from itertools import permutations + +nvar = 3 +variables = list(range(nvar)) +perms = [list(elem) for elem in permutations(variables)] + + +def features_to_canonical_target(features, optimal_ordering): + """ + Reorder the features for the target to be '1'. + + This is done by reordering the features according to the optimal variable + ordering of the set of polynomials. + """ + variable_orderings = perms[optimal_ordering] + nfeatures = len(features) + split_features = [features[int(var*nfeatures/nvar): + int((var+1)*nfeatures/nvar)] + for var in range(nvar)] + ordered_features = [split_features[variable_orderings[i]] + for i in range(nvar)] + return ordered_features + + +def give_all_symmetries(features, optimal_ordering): + """Reorder the features for all possible targets.""" + ordered_features = features_to_canonical_target(features, + optimal_ordering) + all_symmetries = [] + for perm in perms: + new_order_features = [0]*nvar + for index, var in enumerate(perm): + new_order_features[var] = ordered_features[index] + flatten_new_order_features = [elem for lst in new_order_features + for elem in lst] + all_symmetries.append(flatten_new_order_features) + return all_symmetries diff --git a/packages/dataset_manipulation/dataset_manipulation.py b/packages/dataset_manipulation/dataset_manipulation.py index 8c90979..4a5f977 100644 --- a/packages/dataset_manipulation/dataset_manipulation.py +++ b/packages/dataset_manipulation/dataset_manipulation.py @@ -7,7 +7,7 @@ nvar = 3 -def augmentate_dataset(features, targets): +def augmentate_dataset(features, targets, timings): """ Multiply the size of the dataset by 6. @@ -17,13 +17,15 @@ def augmentate_dataset(features, targets): """ symmetric_features = [] symmetric_targets = [] - for features, target in zip(features, targets): + symmetric_timings = [] + for features, target, timing in zip(features, targets, timings): symmetric_features += give_all_symmetries(features, int(target)) symmetric_targets += list(range(math.factorial(nvar))) - return np.array(symmetric_features), np.array(symmetric_targets) + symmetric_timings += list(timing) + return np.array(symmetric_features), np.array(symmetric_targets), np.array(symmetric_timings) -def balance_dataset(features, targets): +def balance_dataset(features, targets, timings): """ Balance the dataset so all targets are almost equally common. 
@@ -33,13 +35,15 @@
     """
     balanced_features = []
     balanced_targets = []
-    for features, target in zip(features, targets):
+    balanced_timings = []
+    for features, target, timing in zip(features, targets, timings):
         symmetric_features = give_all_symmetries(features, int(target))
         possible_targets = list(range(math.factorial(nvar)))
         new_target = random.choice(possible_targets)
         balanced_features.append(symmetric_features[new_target])
         balanced_targets.append(new_target)
-    return np.array(balanced_features), np.array(balanced_targets)
+        balanced_timings.append(timing[new_target])
+    return np.array(balanced_features), np.array(balanced_targets), np.array(balanced_timings)
 
 
 def name_unique_features(names, features):
@@ -57,7 +61,13 @@
                  for ex_feature in new_features])
                 or np.std(feature) == 0):
             rep += 1
+        elif feature.count(feature[0])==len(feature):
+            print(names[index])
         else:
+            # if 'max_in_polys_max_sig'==names[index][:20]:
+            #     print("Check ", feature.count(feature[0])==len(feature))
+            #     print(names[index])
+            #     print(len(feature))
             new_features.append(feature)
             new_names.append(names[index])
     return new_names
diff --git a/replicating_Dorians_features.py b/replicating_Dorians_features.py
new file mode 100644
index 0000000..798778a
--- /dev/null
+++ b/replicating_Dorians_features.py
@@ -0,0 +1,88 @@
+"""Compute the features of Florescu and England (see choose_hyperparams.py) from the input polynomials."""
+import itertools
+
+import numpy as np
+
+nvar = 3
+
+
+def aveg(given_list):
+    """Return the average of a list."""
+    return sum(given_list)/len(given_list)
+
+
+def aveg_not_zero(given_list):
+    """Return the sum of a list divided by its number of non-zero entries (at least one)."""
+    return sum(given_list)/max(1, len([1 for elem in given_list if elem != 0]))
+
+
+def identity(value):
+    return value
+
+
+def sign(value):
+    """Return the sign (-1, 0 or 1) of a number, applied elementwise to lists."""
+    if type(value) == list:
+        return [sign(elem) for elem in value]
+    elif value > 0:
+        return 1
+    elif value < 0:
+        return -1
+    elif value == 0:
+        return 0
+    else:
+        raise ValueError(f"Cannot compute the sign of {value}")
+
+
+def create_features(degrees, variable=0, sv=False):
+    """Combine aggregations over polynomials and monomials into features and their names."""
+    functions = [sum, max, aveg, aveg_not_zero]
+    sign_or_not = [identity, sign]
+    features = []
+    features_names = []
+    for choice in itertools.product(functions, sign_or_not, functions, sign_or_not):
+        feature_description = (choice[0].__name__
+                               + "sign"*(choice[1].__name__ == "sign")
+                               + "_in_polys_" + choice[2].__name__ + "_"
+                               + "sign"*(choice[3].__name__ == "sign") + "of_"
+                               + "sum_of_"*sv + "degrees_of_var_"
+                               + str(variable) + "_in_monomials")
+        feature_value = choice[0](choice[1]([choice[2](choice[3](degrees_in_poly))
+                                             for degrees_in_poly in degrees]))
+        features.append(feature_value)
+        features_names.append(feature_description)
+    return features, features_names
+
+
+def extract_features(dataset):
+    """Return original polynomials, feature names, features, targets and timings."""
+    all_features = []
+    all_targets = []
+    all_timings = []
+    all_original_polynomials = []
+    for index, all_projections in enumerate(dataset[0]):
+        original_polynomials = all_projections[0][0]
+        all_original_polynomials.append(original_polynomials)
+        names = []
+        instance_features = []
+        all_targets.append(dataset[1][index])
+        all_timings.append(dataset[2][index])
+        for var in range(nvar):
+            # degrees of this variable in each monomial of each polynomial
+            degrees = [[monomial[var] for monomial in poly]
+                       for poly in original_polynomials]
+            var_features, var_features_names = create_features(degrees,
+                                                               variable=var)
+            instance_features += var_features
+            names += var_features_names
+            # total degrees of the monomials in which this variable appears
+            sdegrees = [[sum(monomial) for monomial in poly if monomial[var] != 0]+[0]
+                        for poly in original_polynomials]
+            svar_features, svar_features_names = create_features(sdegrees,
+                                                                 variable=var,
+                                                                 sv=True)
+            instance_features += svar_features
+            names += svar_features_names
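+        # store the feature vector assembled for this set of polynomials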
all_features.append(instance_features) + return np.array(all_original_polynomials), np.array(names), np.array(all_features), np.array(all_targets), np.array(all_timings) diff --git a/test_train_datasets.py b/test_train_datasets.py new file mode 100644 index 0000000..9a80cff --- /dev/null +++ b/test_train_datasets.py @@ -0,0 +1,100 @@ +""" +The experiments in [1] are replicated with some changes. + +The first change is that the testing data is balanced, so that all targets +are almost equally common. +Then we use three training sets; dataset as in [1], balanced dataset +and data augmentation dataset. + +[1]Florescu, D., England, M. (2020). A Machine Learning Based Software Pipeline +to Pick the Variable Ordering for Algorithms with Polynomial Inputs. +Bigatti, A., Carette, J., Davenport, J., Joswig, M., de Wolff, T. (eds) +Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science, +vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30 +""" + + +import os +import pickle +import random +import csv +import yaml +import importlib.util +# Check if 'dataset_manipulation' is installed +if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)): + from dataset_manipulation import name_unique_features + from dataset_manipulation import remove_notunique_features + from dataset_manipulation import balance_dataset + from dataset_manipulation import augmentate_dataset +else: + from packages.dataset_manipulation import name_unique_features + from packages.dataset_manipulation import remove_notunique_features + from packages.dataset_manipulation import balance_dataset + from packages.dataset_manipulation import augmentate_dataset +from sklearn.preprocessing import normalize +from sklearn.model_selection import train_test_split + + +def count_instances(my_dataset, instance): + return sum(my_dataset==instance) + + +names_features_targets_file = os.path.join(os.path.dirname(__file__), + 'datasets', + 'clean_dataset.txt') +with open(names_features_targets_file, 'rb') as f: + original_polys, names, features, targets, timings = pickle.load(f) + +augmented_features, augmented_targets, augmented_timings = augmentate_dataset(features, targets, timings) + +normalized_augmented_features = normalize(augmented_features) +unique_names = name_unique_features(names, + augmented_features) + +random_state = 0 + +x = dict() # to keep the features +y = dict() # to keep the labels +t = dict() # to keep the timings +# train and test sets are created +not_unique_x_normal_train, not_unique_x_normal_test, y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(features, targets, timings, + test_size=0.20, + random_state=random_state) + +not_unique_balanced_x_test, y['test_balanced'], t['test_balanced'] = balance_dataset(not_unique_x_normal_test, y['test_normal'], t['test_normal']) +x['test_balanced'] = remove_notunique_features(unique_names, names, not_unique_balanced_x_test) +# testing data for all approaches is ready +# all tests will be done in balanced but the others are also computed +not_unique_augmented_x_test, y['test_augmented'], t['test_augmented'] = augmentate_dataset(not_unique_x_normal_test, y['test_normal'], t['test_normal']) +x['test_augmented'] = remove_notunique_features(unique_names, names, not_unique_augmented_x_test) +x['test_normal'] = remove_notunique_features(unique_names, names, not_unique_x_normal_test) + +x['train_normal'] = remove_notunique_features(unique_names, names, not_unique_x_normal_train) +# normal 
training data ready +not_unique_balanced_x_train, y['train_balanced'], t['train_balanced'] = balance_dataset(not_unique_x_normal_train, y['train_normal'], t['train_normal']) +x['train_balanced'] = remove_notunique_features(unique_names, names, not_unique_balanced_x_train) +# balanced training data ready +not_unique_augmented_x_train, y['train_augmented'], t['train_augmented'] = augmentate_dataset(not_unique_x_normal_train, y['train_normal'], t['train_normal']) +x['train_augmented'] = remove_notunique_features(unique_names, names, not_unique_augmented_x_train) +# augmented training data ready + + +dataset_info_file = os.path.join(os.path.dirname(__file__), + 'datasets', + 'dataset_instances.csv') +with open(dataset_info_file, 'w') as f_dataset_info: + writer = csv.writer(f_dataset_info) + writer.writerow(['dataset'] + ['zero','one','two','three','four','five','total']) + for usage in ['train', 'test']: + for method in ['normal', 'balanced', 'augmented']: + print(f"y['{usage}_{method}'])", len(y[f'{usage}_{method}'])) + this_dataset_file = os.path.join(os.path.dirname(__file__), + 'datasets', usage, + f'{method}_{usage}_dataset.txt') + with open(this_dataset_file, 'wb') as f: + pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}']), f) + + writer.writerow([f'{usage} {method} dataset'] + + [str(count_instances(y[f'{usage}_{method}'], i)) + for i in range(6)] + + [str(len(y[f'{usage}_{method}']))]) \ No newline at end of file diff --git a/train_models.py b/train_models.py new file mode 100644 index 0000000..abebcd1 --- /dev/null +++ b/train_models.py @@ -0,0 +1,15 @@ +import yaml +from yaml import UnsafeLoader +import os +from config.ml_models import ml_models +from config.ml_models import dataset_types + +print(ml_models) +for ml_model in ml_models: + for method in dataset_types: + filename = os.path.join(os.path.dirname(__file__), + 'config', 'hyperparams', + f'{method}_{ml_model}.yaml') + with open(filename, 'r') as f: + hyperparameters = yaml.load(f, Loader=UnsafeLoader) + print(type(hyperparameters), hyperparameters)
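+
+# A possible next step (sketch only, not part of the original script): rebuild
+# each model from its name and refit it on the matching training set, as
+# choose_hyperparams.py does. This assumes `classifiers` from config.ml_models
+# and the pickled training sets written by test_train_datasets.py, e.g. inside
+# the loop above:
+#
+#     from config.ml_models import classifiers
+#     import pickle
+#     train_file = os.path.join(os.path.dirname(__file__),
+#                               'datasets', 'train',
+#                               f'{method}_train_dataset.txt')
+#     with open(train_file, 'rb') as g:
+#         x_train, y_train = pickle.load(g)
+#     clf = classifiers[ml_model](**hyperparameters)
+#     clf.fit(x_train, y_train)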