Commit
Showing 21 changed files with 539 additions and 41 deletions.
Binary file added: DatasetsBeforeProcessing/dataset_without_repetition_return_ncells.txt (BIN, +36.3 MB; contents not shown)
@@ -0,0 +1,106 @@
"""
The experiments in [1] are replicated with some changes.

The first change is that the testing data is balanced, so that all targets
are almost equally common.
Then we use three training sets: the dataset as in [1], a balanced dataset
and a data-augmentation dataset.

[1] Florescu, D., England, M. (2020). A Machine Learning Based Software
Pipeline to Pick the Variable Ordering for Algorithms with Polynomial Inputs.
In: Bigatti, A., Carette, J., Davenport, J., Joswig, M., de Wolff, T. (eds)
Mathematical Software, ICMS 2020. Lecture Notes in Computer Science,
vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30
"""


import os
import pickle
import csv
import yaml
from config.ml_models import ml_models
from config.ml_models import classifiers
from config.ml_models import dataset_types
from config.hyperparameters_grid import grid
from sklearn.model_selection import GridSearchCV


def write_yaml_to_file(py_obj, filename):
    """Dump a Python object to '<filename>.yaml'."""
    with open(f'{filename}.yaml', 'w') as f:
        yaml.dump(py_obj, f, sort_keys=False)
    print('Written to file successfully')


def k_folds_ml(x_train, y_train, model, random_state=0):
    """
    Choose hyperparameters for the desired model.

    The hyperparameters of the model are chosen using 5-fold
    cross-validation over the grid in config.hyperparameters_grid.
    """
    current_classifier = classifiers[model]
    current_grid = grid[model]
    rf_cv = GridSearchCV(estimator=current_classifier(),
                         param_grid=current_grid,
                         cv=5)
    rf_cv.fit(x_train, y_train)
    return rf_cv.best_params_


# Load the two fixed test sets: one balanced, one with the original
# ("normal") target distribution.
test_balanced_dataset_file = os.path.join(os.path.dirname(__file__),
                                          'datasets', 'test',
                                          'balanced_test_dataset.txt')
with open(test_balanced_dataset_file, 'rb') as g:
    balanced_x_test, balanced_y_test = pickle.load(g)

test_normal_dataset_file = os.path.join(os.path.dirname(__file__),
                                        'datasets', 'test',
                                        'normal_test_dataset.txt')
with open(test_normal_dataset_file, 'rb') as g:
    normal_x_test, normal_y_test = pickle.load(g)

output_file_balanced = os.path.join(os.path.dirname(__file__),
                                    'ml_results_k_fold_tested_in_balanced.csv')
with open(output_file_balanced, 'w') as f_balanced:
    writer_balanced = csv.writer(f_balanced)
    writer_balanced.writerow(["Name"] + dataset_types)
    output_file_normal = os.path.join(os.path.dirname(__file__),
                                      'ml_results_k_fold_tested_in_normal.csv')
    with open(output_file_normal, 'w') as f_normal:
        writer_normal = csv.writer(f_normal)
        writer_normal.writerow(["Name"] + dataset_types)
        for ml_model in ml_models:
            print(f"Model: {ml_model}")
            acc_balanced = dict()
            acc_normal = dict()
            for method in dataset_types:
                this_dataset_file = os.path.join(os.path.dirname(__file__),
                                                 'datasets', 'train',
                                                 f'{method}_train_dataset.txt')
                with open(this_dataset_file, 'rb') as f:
                    method_x_train, method_y_train = pickle.load(f)
                hyperparams = k_folds_ml(method_x_train, method_y_train,
                                         model=ml_model)
                write_yaml_to_file(hyperparams,
                                   os.path.join(os.path.dirname(__file__),
                                                'config', 'hyperparams',
                                                f'{method}_{ml_model}'))
                current_classifier = classifiers[ml_model]
                clf = current_classifier(**hyperparams)
                clf.fit(method_x_train, method_y_train)
                acc_balanced[method] = clf.score(balanced_x_test,
                                                 balanced_y_test)
                acc_normal[method] = clf.score(normal_x_test, normal_y_test)
                method_file = os.path.join(os.path.dirname(__file__),
                                           'config', 'models',
                                           f'{method}_trained_model.txt')
                with open(method_file, 'wb') as f_method:
                    pickle.dump(clf, f_method)
            round_accuracies_balanced = [round(acc_balanced[method_here], 2)
                                         for method_here in dataset_types]
            round_accuracies_normal = [round(acc_normal[method_here], 2)
                                       for method_here in dataset_types]
            writer_balanced.writerow([ml_model] + round_accuracies_balanced)
            writer_normal.writerow([ml_model] + round_accuracies_normal)
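
This script assumes that config/ml_models.py exposes ml_models, classifiers and dataset_types, and that config/hyperparameters_grid.py exposes grid; neither file's contents appear in this diff. A minimal hypothetical sketch consistent with how those names are used:

# Hypothetical sketch only; the real config modules are not shown in this
# commit. Model names, classifier choices and grids are illustrative.
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# config/ml_models.py
ml_models = ['RF', 'KNN']                      # names the script iterates over
classifiers = {'RF': RandomForestClassifier,   # model name -> sklearn class
               'KNN': KNeighborsClassifier}
dataset_types = ['normal', 'balanced', 'augmented']

# config/hyperparameters_grid.py: one parameter grid per model name
grid = {'RF': {'n_estimators': [50, 100], 'max_depth': [None, 10]},
        'KNN': {'n_neighbors': [3, 5, 7]}}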
@@ -0,0 +1,26 @@
import os
import pickle

import numpy as np

from replicating_Dorians_features import extract_features


dataset_file = os.path.join(os.path.dirname(__file__),
                            'DatasetsBeforeProcessing',
                            'dataset_without_repetition_return_ncells.txt')
with open(dataset_file, 'rb') as f:
    dataset = pickle.load(f)
(original_polys_list, names, features_list,
 targets_list, timings_list) = extract_features(dataset)

# working with raw features
features = np.array(features_list)
targets = np.array(targets_list)
timings = np.array(timings_list)
original_polys = np.array(original_polys_list)

clean_dataset_file = os.path.join(os.path.dirname(__file__),
                                  'datasets',
                                  'clean_dataset.txt')
with open(clean_dataset_file, 'wb') as g:
    pickle.dump((original_polys, names, features, targets, timings), g)
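
For context, extract_features (shown later in this diff) indexes the loaded pickle as dataset[0] (the projections for each instance, where entry [0][0] is the original polynomial set), dataset[1] (the target orderings) and dataset[2] (the timings). A hypothetical toy pickle of that shape, purely for illustration:

# Hypothetical minimal instance of the raw dataset layout, inferred from
# how extract_features indexes it; the real file is a 36 MB binary.
polys = [[[2, 1, 0], [0, 0, 1]]]   # one polynomial as exponent vectors
toy_dataset = ([[[polys]]],        # dataset[0]: projections per instance
               [3],                # dataset[1]: target ordering per instance
               [0.12])             # dataset[2]: timing data per instance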
Several more binary files changed (contents not shown).
@@ -0,0 +1,4 @@
from .dataset_manipulation import augmentate_dataset  # noqa: F401
from .dataset_manipulation import balance_dataset  # noqa: F401
from .dataset_manipulation import name_unique_features  # noqa: F401
from .dataset_manipulation import remove_notunique_features  # noqa: F401
packages/build/lib/dataset_manipulation/dataset_manipulation.py (72 additions, 0 deletions)
@@ -0,0 +1,72 @@
"""Exploit symmetries in polynomials to augmentate or balance the dataset."""
import math
import random

import numpy as np

from .exploit_symmetries import give_all_symmetries

nvar = 3


def augmentate_dataset(features, targets):
    """
    Multiply the size of the dataset by 6.

    Arguments:
    features: list(list(numpy.float))
    targets: list(numpy.float)
    """
    symmetric_features = []
    symmetric_targets = []
    for instance_features, target in zip(features, targets):
        symmetric_features += give_all_symmetries(instance_features,
                                                  int(target))
        symmetric_targets += list(range(math.factorial(nvar)))
    return np.array(symmetric_features), np.array(symmetric_targets)


def balance_dataset(features, targets):
    """
    Balance the dataset so that all targets are almost equally common.

    Arguments:
    features: list(list(numpy.float))
    targets: list(numpy.float)
    """
    balanced_features = []
    balanced_targets = []
    for instance_features, target in zip(features, targets):
        symmetric_features = give_all_symmetries(instance_features,
                                                 int(target))
        possible_targets = list(range(math.factorial(nvar)))
        new_target = random.choice(possible_targets)
        balanced_features.append(symmetric_features[new_target])
        balanced_targets.append(new_target)
    return np.array(balanced_features), np.array(balanced_targets)


def name_unique_features(names, features):
    """
    Return the names of unique features.

    When two features share the same value for all the instances,
    one of them is not considered unique; constant features are
    also discarded.
    """
    new_features = []
    new_names = []
    for index, feature in enumerate(zip(*features)):
        is_repeated = any(np.array_equal(feature, ex_feature)
                          for ex_feature in new_features)
        if not is_repeated and np.std(feature) != 0:
            new_features.append(feature)
            new_names.append(names[index])
    return new_names


def remove_notunique_features(unique_names, names, features):
    """Return the features whose name is in 'unique_names'."""
    unique_features = []
    for index, feature in enumerate(zip(*features)):
        if names[index] in unique_names:
            unique_features.append(feature)
    return np.transpose(unique_features)
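
A small usage sketch of the two transformations above on toy data (assuming the package is importable as dataset_manipulation; feature vectors must split into nvar = 3 equal blocks):

from dataset_manipulation import augmentate_dataset, balance_dataset

# Two toy instances with six features each (two per variable).
toy_features = [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
                [0.5, 0.1, 0.2, 0.9, 0.7, 0.3]]
toy_targets = [0.0, 3.0]

aug_x, aug_y = augmentate_dataset(toy_features, toy_targets)
print(aug_x.shape)   # (12, 6): each instance yields 6 symmetric copies
print(aug_y[:6])     # [0 1 2 3 4 5]

bal_x, bal_y = balance_dataset(toy_features, toy_targets)
print(bal_x.shape)   # (2, 6): one randomly chosen symmetry per instance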
packages/build/lib/dataset_manipulation/exploit_symmetries.py (51 additions, 0 deletions)
@@ -0,0 +1,51 @@
"""
Exploit symmetries in three-variable polynomials to generate up to six
instances out of each existing one.

The task at hand consists in classifying these features.
We will take advantage of the fact that we can change the target by
reordering the features.

This file contains:
- features_to_canonical_target: a function that reorders the features so
  that the target becomes '1'; this ordering is called the canonical order.
- give_all_symmetries: a function that, given the canonical order, returns
  the reorderings for each of the possible targets '1', '2', ..., '6'.
"""
from itertools import permutations

nvar = 3
variables = list(range(nvar))
perms = [list(elem) for elem in permutations(variables)]


def features_to_canonical_target(features, optimal_ordering):
    """
    Reorder the features so that the target becomes '1'.

    This is done by reordering the features according to the optimal
    variable ordering of the set of polynomials.
    """
    variable_orderings = perms[optimal_ordering]
    nfeatures = len(features)
    # Split the flat feature vector into one block per variable.
    split_features = [features[int(var*nfeatures/nvar):
                               int((var+1)*nfeatures/nvar)]
                      for var in range(nvar)]
    ordered_features = [split_features[variable_orderings[i]]
                        for i in range(nvar)]
    return ordered_features


def give_all_symmetries(features, optimal_ordering):
    """Reorder the features for all possible targets."""
    ordered_features = features_to_canonical_target(features,
                                                    optimal_ordering)
    all_symmetries = []
    for perm in perms:
        new_order_features = [0]*nvar
        for index, var in enumerate(perm):
            new_order_features[var] = ordered_features[index]
        flat_features = [elem for block in new_order_features
                         for elem in block]
        all_symmetries.append(flat_features)
    return all_symmetries
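
A quick illustration of give_all_symmetries, a sketch assuming six features (two per variable): passing optimal_ordering 0 means the vector is already in canonical order, and the six outputs permute the per-variable blocks.

symmetries = give_all_symmetries([1, 2, 3, 4, 5, 6], 0)
print(symmetries[0])    # [1, 2, 3, 4, 5, 6] (identity permutation)
print(symmetries[1])    # [1, 2, 5, 6, 3, 4] (blocks of variables 1 and 2 swapped)
print(len(symmetries))  # 6 reorderings, one per possible target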
@@ -0,0 +1,71 @@
import itertools

import numpy as np

nvar = 3


def aveg(given_list):
    """Return the average of the list."""
    return sum(given_list)/len(given_list)


def aveg_not_zero(given_list):
    """Return the sum of the list divided by the number of nonzero elements."""
    return sum(given_list)/max(1, len([1 for elem in given_list if elem != 0]))


def identity(value):
    return value


def sign(value):
    if isinstance(value, list):
        return [sign(elem) for elem in value]
    if value > 0:
        return 1
    elif value < 0:
        return -1
    elif value == 0:
        return 0
    else:
        raise ValueError(f"cannot compute the sign of {value}")


def create_features(degrees, variable=0, sv=False):
    """Build one feature per combination of aggregation and sign functions."""
    functions = [sum, max, aveg, aveg_not_zero]
    sign_or_not = [identity, sign]
    features = []
    features_names = []
    for choice in itertools.product(functions, sign_or_not,
                                    functions, sign_or_not):
        feature_description = (choice[0].__name__
                               + "sign"*(choice[1].__name__ == "sign")
                               + "_in_polys_" + choice[2].__name__ + "_"
                               + "sign"*(choice[3].__name__ == "sign")
                               + "of_" + "sum_of_"*sv
                               + "degrees_of_var_" + str(variable)
                               + "_in_monomials")
        feature_value = choice[0](choice[1]([choice[2](choice[3](degrees_in_poly))
                                             for degrees_in_poly in degrees]))
        features.append(feature_value)
        features_names.append(feature_description)
    return features, features_names


def extract_features(dataset):
    all_features = []
    all_targets = []
    all_timings = []
    all_original_polynomials = []
    for index, all_projections in enumerate(dataset[0]):
        original_polynomials = all_projections[0][0]
        all_original_polynomials.append(original_polynomials)
        names = []
        instance_features = []
        all_targets.append(dataset[1][index])
        all_timings.append(dataset[2][index])
        for var in range(nvar):
            # Degrees of 'var' in each monomial of each polynomial.
            degrees = [[monomial[var] for monomial in poly]
                       for poly in original_polynomials]
            var_features, var_features_names = create_features(degrees,
                                                               variable=var)
            instance_features += var_features
            names += var_features_names
            # Total degrees of the monomials that contain 'var'.
            sdegrees = [[sum(monomial) for monomial in poly
                         if monomial[var] != 0] + [0]
                        for poly in original_polynomials]
            svar_features, svar_features_names = create_features(sdegrees,
                                                                 variable=var,
                                                                 sv=True)
            instance_features += svar_features
            names += svar_features_names
        all_features.append(instance_features)
    return (np.array(all_original_polynomials), np.array(names),
            np.array(all_features), np.array(all_targets),
            np.array(all_timings))
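
To make the feature construction concrete, here is a toy run of create_features, a sketch in which polynomials are represented, as in extract_features, by lists of monomial exponent vectors:

# Two toy polynomials over (x0, x1, x2) as exponent vectors:
# x0^2*x1 + x2  and  x0*x1*x2.
polys = [[[2, 1, 0], [0, 0, 1]],
         [[1, 1, 1]]]
degrees_var0 = [[monomial[0] for monomial in poly] for poly in polys]
# degrees_var0 == [[2, 0], [1]]
features, names = create_features(degrees_var0, variable=0)
print(len(features))  # 64 = 4 functions x 2 sign options, nested twice
print(names[0])       # sum_in_polys_sum_of_degrees_of_var_0_in_monomials
print(features[0])    # 3 = (2 + 0) + (1)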
@@ -0,0 +1,100 @@
"""
The experiments in [1] are replicated with some changes.

The first change is that the testing data is balanced, so that all targets
are almost equally common.
Then we use three training sets: the dataset as in [1], a balanced dataset
and a data-augmentation dataset.

[1] Florescu, D., England, M. (2020). A Machine Learning Based Software
Pipeline to Pick the Variable Ordering for Algorithms with Polynomial Inputs.
In: Bigatti, A., Carette, J., Davenport, J., Joswig, M., de Wolff, T. (eds)
Mathematical Software, ICMS 2020. Lecture Notes in Computer Science,
vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30
"""


import os
import pickle
import csv
import importlib.util
# Check if 'dataset_manipulation' is installed.
# NB: augmentate_dataset and balance_dataset are used below with timings as
# a third argument; the copy under packages/build/lib shown earlier in this
# diff takes only (features, targets).
if importlib.util.find_spec('dataset_manipulation') is None:
    from dataset_manipulation import name_unique_features
    from dataset_manipulation import remove_notunique_features
    from dataset_manipulation import balance_dataset
    from dataset_manipulation import augmentate_dataset
else:
    from packages.dataset_manipulation import name_unique_features
    from packages.dataset_manipulation import remove_notunique_features
    from packages.dataset_manipulation import balance_dataset
    from packages.dataset_manipulation import augmentate_dataset
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


def count_instances(my_dataset, instance):
    """Count how many times 'instance' appears in 'my_dataset'."""
    return sum(my_dataset == instance)


names_features_targets_file = os.path.join(os.path.dirname(__file__),
                                           'datasets',
                                           'clean_dataset.txt')
with open(names_features_targets_file, 'rb') as f:
    original_polys, names, features, targets, timings = pickle.load(f)

(augmented_features, augmented_targets,
 augmented_timings) = augmentate_dataset(features, targets, timings)

normalized_augmented_features = normalize(augmented_features)
unique_names = name_unique_features(names, augmented_features)

random_state = 0

x = dict()  # to keep the features
y = dict()  # to keep the labels
t = dict()  # to keep the timings
# train and test sets are created
(not_unique_x_normal_train, not_unique_x_normal_test,
 y['train_normal'], y['test_normal'],
 t['train_normal'], t['test_normal']) = train_test_split(
    features, targets, timings,
    test_size=0.20,
    random_state=random_state)

(not_unique_balanced_x_test,
 y['test_balanced'], t['test_balanced']) = balance_dataset(
    not_unique_x_normal_test, y['test_normal'], t['test_normal'])
x['test_balanced'] = remove_notunique_features(unique_names, names,
                                               not_unique_balanced_x_test)
# testing data for all approaches is ready;
# all tests will be done on the balanced set, but the others are
# also computed
(not_unique_augmented_x_test,
 y['test_augmented'], t['test_augmented']) = augmentate_dataset(
    not_unique_x_normal_test, y['test_normal'], t['test_normal'])
x['test_augmented'] = remove_notunique_features(unique_names, names,
                                                not_unique_augmented_x_test)
x['test_normal'] = remove_notunique_features(unique_names, names,
                                             not_unique_x_normal_test)

x['train_normal'] = remove_notunique_features(unique_names, names,
                                              not_unique_x_normal_train)
# normal training data ready
(not_unique_balanced_x_train,
 y['train_balanced'], t['train_balanced']) = balance_dataset(
    not_unique_x_normal_train, y['train_normal'], t['train_normal'])
x['train_balanced'] = remove_notunique_features(unique_names, names,
                                                not_unique_balanced_x_train)
# balanced training data ready
(not_unique_augmented_x_train,
 y['train_augmented'], t['train_augmented']) = augmentate_dataset(
    not_unique_x_normal_train, y['train_normal'], t['train_normal'])
x['train_augmented'] = remove_notunique_features(unique_names, names,
                                                 not_unique_augmented_x_train)
# augmented training data ready


dataset_info_file = os.path.join(os.path.dirname(__file__),
                                 'datasets',
                                 'dataset_instances.csv')
with open(dataset_info_file, 'w') as f_dataset_info:
    writer = csv.writer(f_dataset_info)
    writer.writerow(['dataset', 'zero', 'one', 'two', 'three',
                     'four', 'five', 'total'])
    for usage in ['train', 'test']:
        for method in ['normal', 'balanced', 'augmented']:
            print(f"y['{usage}_{method}']", len(y[f'{usage}_{method}']))
            this_dataset_file = os.path.join(os.path.dirname(__file__),
                                             'datasets', usage,
                                             f'{method}_{usage}_dataset.txt')
            with open(this_dataset_file, 'wb') as f:
                pickle.dump((x[f'{usage}_{method}'],
                             y[f'{usage}_{method}']), f)

            writer.writerow([f'{usage} {method} dataset']
                            + [str(count_instances(y[f'{usage}_{method}'], i))
                               for i in range(6)]
                            + [str(len(y[f'{usage}_{method}']))])
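
count_instances leans on numpy broadcasting: comparing an array with a scalar yields a boolean mask, and summing it counts the True entries. A one-line check on toy labels:

import numpy as np

# target 1 appears twice in this toy label array
assert count_instances(np.array([0, 1, 1, 5]), 1) == 2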
@@ -0,0 +1,15 @@
import os

import yaml
from yaml import UnsafeLoader

from config.ml_models import ml_models
from config.ml_models import dataset_types

print(ml_models)
for ml_model in ml_models:
    for method in dataset_types:
        filename = os.path.join(os.path.dirname(__file__),
                                'config', 'hyperparams',
                                f'{method}_{ml_model}.yaml')
        with open(filename, 'r') as f:
            hyperparameters = yaml.load(f, Loader=UnsafeLoader)
        print(type(hyperparameters), hyperparameters)
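
The YAML files read here are the ones written by write_yaml_to_file from GridSearchCV.best_params_. A hypothetical round-trip with illustrative values (the model name 'RF' and the parameters are assumptions, not taken from this commit):

import yaml

# best_params_ as write_yaml_to_file would serialize it
params = {'max_depth': 10, 'n_estimators': 100}  # illustrative values
print(yaml.dump(params, sort_keys=False))
# max_depth: 10
# n_estimators: 100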