Bug in augmenting dataset detected, results looking amazing
Tereso del Rio committed Sep 16, 2023
1 parent 7a8a578 commit 427c13d
Showing 13 changed files with 294 additions and 99 deletions.
Binary file modified datasets/test/augmented_test_dataset.txt
Binary file not shown.
Binary file modified datasets/test/balanced_test_dataset.txt
Binary file not shown.
Binary file modified datasets/train/augmented_train_dataset.txt
Binary file not shown.
Binary file modified datasets/train/balanced_train_dataset.txt
Binary file not shown.
7 changes: 7 additions & 0 deletions find_filename.py
@@ -47,3 +47,10 @@ def find_output_filename(training_method):
def find_other_filename(search):
return os.path.join(os.path.dirname(__file__), 'config',
f'{search}.txt')


import pickle
names_filename = find_other_filename('unique_names')
with open(names_filename, 'rb') as names_f:
names = pickle.load(names_f)
print(len(names), '\n', names[2], '\n', names[67], '\n', names[132])
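This debug block runs at import time, so any module importing find_filename would execute it. A minimal sketch of a safer variant, assuming the block is intentional scaffolding for the gmods test below, wraps it in a main guard:

if __name__ == '__main__':
    import pickle
    names_filename = find_other_filename('unique_names')
    with open(names_filename, 'rb') as names_f:
        names = pickle.load(names_f)
    # indices 2, 67 and 132 are the feature positions used by
    # choose_gmods in main_heuristics.py below
    print(len(names), '\n', names[2], '\n', names[67], '\n', names[132])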
127 changes: 127 additions & 0 deletions from_poly_set_to_features.py
@@ -0,0 +1,127 @@
"""This file will contain the functions necessary to convert
a list of sets of polynomials to a list of their features.
This features will be unique and standarised"""
import math
import numpy as np
import pickle
from packages.dataset_manipulation import augmentate_dataset
from find_filename import find_other_filename
from replicating_Dorians_features import features_from_set_of_polys


def poly_set_feature_extractor(sets_of_polys, determine_unique_features=False,
                               determine_standarization=False):
    """Given a list of polynomial sets, return the list of their features"""
    features_list = []
    for set_of_polys in sets_of_polys:
        names, features = features_from_set_of_polys(set_of_polys)
        features_list.append(features)
    if determine_unique_features:
        # if we want to (re)compute and save the unique feature names
        find_unique_features(names, features_list)
    unique_names, unique_features = get_unique_features(names, features_list)
    if determine_standarization:
        find_standarizing_values(unique_names, unique_features)
    # standardisation always reuses the values saved by the call above
    standarized_features = get_standarized_features(unique_names,
                                                    unique_features)
    # return the unique names so they match the returned features
    return unique_names, standarized_features


# def features_set_of_polys(original_polynomials):
# instance_features = []
# names = []
# nvar = len(original_polynomials[0][0]) - 1
# for var in range(nvar):
# degrees = [[monomial[var] for monomial in poly]
# for poly in original_polynomials]
# var_features, var_features_names = create_features(degrees,
# variable=var)
# instance_features += var_features
# names += var_features_names
# sdegrees = [[sum(monomial) for monomial in poly
# if monomial[var]!=0]+[0]
# for poly in original_polynomials]
# svar_features, svar_features_names = create_features(sdegrees,
# variable=var,
# sv=True)
# instance_features += svar_features
# names += svar_features_names
# return names, instance_features


def find_unique_features(names, features):
"""
    Saves the names of the unique features in the assigned file.

    When two features share the same value for all the instances,
    or they coincide after addition or multiplication,
    only one of them is considered unique.
"""
    # we look for uniqueness after augmenting, so that features that
    # only coincide after augmentation are also discarded
    # creating dummy timings and cells so that augmentate_dataset
    # (which now takes all_features, all_timings, all_cells, nvar) can be called
    nvar = 3  # assumed fixed here, as elsewhere in this commit
    dummy_timings = [[0] * math.factorial(nvar)] * len(features)
    dummy_cells = [[0] * math.factorial(nvar)] * len(features)
    augmented_features, _, _ = augmentate_dataset(features, dummy_timings,
                                                  dummy_cells, nvar)
# now we look for the unique features
unique_features = []
unique_names = []
for index, feature in enumerate(zip(*augmented_features)):
        if (any(np.array_equal(feature, ex_feature)
                for ex_feature in unique_features)
                or np.std(feature) == 0):
            # this feature has already been recorded, or it is constant
            pass
        elif feature.count(feature[0]) == len(feature):
            # check if it is a constant list
            pass
        else:
            # if none of the previous conditions hold, the feature is unique
unique_features.append(feature)
unique_names.append(names[index])
unique_names_filename = find_other_filename('unique_names')
with open(unique_names_filename, 'wb') as unique_names_file:
pickle.dump(unique_names, unique_names_file)


def get_unique_features(names, features):
"""Return the features corresponding to a name in 'unique_names'."""
# We recover the list of unique feature names
unique_names_filename = find_other_filename('unique_names')
with open(unique_names_filename, 'rb') as unique_names_file:
unique_names = pickle.load(unique_names_file)
# we keep only the features that are unique
unique_features = []
    for index, feature in enumerate(zip(*features)):
        if names[index] in unique_names:
            unique_features.append(feature)
return unique_names, np.transpose(unique_features)


def find_standarizing_values(names, features_list):
    """Finds and saves the mean and std of the different features
    so that features can be standardised in a consistent way
    before giving them to the machine learning models"""
    standarizing_values = dict()
    # features_list is instances x features, so we iterate over its
    # columns, one per feature name
    for name, features in zip(names, zip(*features_list)):
        standarizing_values[name] = (np.mean(features), np.std(features))
standarizing_values_filename = find_other_filename('standarizing_values')
with open(standarizing_values_filename, 'wb') as standarizing_values_file:
pickle.dump(standarizing_values, standarizing_values_file)


def get_standarized_features(names, features):
    """Returns the standardised features."""
    # We recover the standardising values (mean and std) of each feature
    standarizing_values_filename = find_other_filename('standarizing_values')
    with open(standarizing_values_filename, 'rb') as standarizing_values_file:
        standarizing_values = pickle.load(standarizing_values_file)
    # each feature is standardised using its saved mean and std
    standarized_features = []
    for index, feature in enumerate(zip(*features)):
        mean, std = standarizing_values[names[index]]
        standarized_features.append((np.array(feature) - mean) / std)
    return np.transpose(standarized_features)
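A minimal usage sketch of the new extractor. The toy input is hypothetical, assuming each monomial is a list of one exponent per variable plus one extra entry, as the commented-out code above suggests (nvar = len(original_polynomials[0][0]) - 1):

from from_poly_set_to_features import poly_set_feature_extractor

# hypothetical toy input: one set of two polynomials in three variables
sets_of_polys = [[[[1, 0, 0, 2], [0, 1, 1, 1]],
                  [[2, 0, 0, 1], [0, 0, 1, 3]]]]
# first call: compute and save the unique feature names and their mean/std
names, features = poly_set_feature_extractor(sets_of_polys,
                                             determine_unique_features=True,
                                             determine_standarization=True)
# later calls reuse the saved values, so training and testing data
# are standardised consistently
names, features = poly_set_feature_extractor(sets_of_polys)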
12 changes: 6 additions & 6 deletions main.py
@@ -30,7 +30,7 @@
# Hyperparameter tuning takes a very long time;
# tune_hyperparameters decides whether to tune them
# or to use previously tuned values
tune_hyperparameters = True
tune_hyperparameters = False
paradigm = 'classification'

# cleaning_dataset()
@@ -41,11 +41,11 @@
for method in dataset_qualities:
print(f"Choosing hyperparameters for {ml_model} in {method}")
choose_hyperparams(ml_model, method)
for ml_model in ml_models:
print(f"Training {ml_model}")
for method in dataset_qualities:
print(f"for {method}")
train_model(ml_model, method)
# for ml_model in ml_models:
# print(f"Training {ml_model}")
# for method in dataset_qualities:
# print(f"for {method}")
# train_model(ml_model, method)
training_method = 'augmented'
testing_method = 'augmented'
first_time = 1
116 changes: 72 additions & 44 deletions main_heuristics.py
@@ -2,33 +2,61 @@
import math
import pickle
import random
import numpy as np
from Heuristics.heuristics_guess import not_greedy_heuristic_guess
from Heuristics.heuristics_guess import choose_order_given_projections
# import numpy as np
# from Heuristics.heuristics_guess import not_greedy_heuristic_guess
# from Heuristics.heuristics_guess import choose_order_given_projections
from find_filename import find_dataset_filename
from test_models import compute_metrics

nvar = 3
testing_method = 'Normal'
testing_method = 'Augmented'
test_dataset_filename = find_dataset_filename('Test',
testing_method)
with open(test_dataset_filename, 'rb') as test_dataset_file:
testing_dataset = pickle.load(test_dataset_file)
output_file = "heuristics_output_acc_time.csv"


# TESTING GMODS IN AUGMENTED: features 2, 67 and 132
def choose_gmods(features):
    a = []
    # print(features)
    a.append(features[2])
    a.append(features[67])
    a.append(features[132])
    if a[0] == min(a):
        if a[1] <= a[2]:
            return 0
        else:
            return 1
    elif a[1] == min(a):
        if a[0] <= a[2]:
            return 2
        else:
            return 3
    else:
        if a[0] <= a[1]:
            return 4
        else:
            return 5

# Testing heuristics that make the whole choice at once
first_heuristic = 1
for heuristic in ['gmods', 'brown', 'random', 'virtual best']:
reps = 100
# for heuristic in ['T1', 'gmods', 'brown', 'random', 'virtual best']:
for heuristic in ['gmods', 'virtual best']:
reps = 10
sum_metrics = dict()
for i in range(reps):
if heuristic == 'virtual best':
chosen_indices = [np.argmin(timings) for timings in testing_dataset['timings']]
# chosen_indices = [np.argmin(timings) for timings in testing_dataset['timings']]
chosen_indices = testing_dataset['labels']
elif heuristic == 'random':
chosen_indices = [random.randint(0, 5) for timings in testing_dataset['timings']]
else:
chosen_indices = [not_greedy_heuristic_guess(projection[0][0], heuristic)
for projection in testing_dataset['projections']]
# chosen_indices = [not_greedy_heuristic_guess(projection[0][0], heuristic)
# for projection in testing_dataset['projections']]
chosen_indices = [choose_gmods(features)
for features in testing_dataset['features']]
metrics = compute_metrics(chosen_indices,
testing_dataset['labels'],
testing_dataset['timings'],
@@ -38,8 +66,8 @@
else:
sum_metrics = {key: metrics[key] + sum_metrics[key] for key in metrics}
aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics}
augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup'] else math.factorial(nvar)*aveg_metrics[key] for key in sum_metrics}
augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup'] else math.factorial(1)*aveg_metrics[key] for key in sum_metrics}

print(heuristic, augmented_metrics)
if first_heuristic == 1:
first_heuristic = 0
@@ -51,37 +79,37 @@
writer = csv.writer(f)
writer.writerow([heuristic] + [augmented_metrics[key] for key in keys])

# Testing on greedy heuristics
for heuristic in ['brown', 'gmods', 'random', 'virtual best']:
reps = 100
sum_metrics = dict()
for i in range(reps):
if heuristic == 'virtual best':
chosen_indices = [np.argmin(timings) for timings in testing_dataset['timings']]
elif heuristic == 'random':
chosen_indices = [random.randint(0, 5) for timings in testing_dataset['timings']]
else:
chosen_indices = [choose_order_given_projections(projection, heuristic)
for projection in testing_dataset['projections']]
metrics = compute_metrics(chosen_indices,
testing_dataset['labels'],
testing_dataset['timings'],
testing_dataset['cells'])
if len(sum_metrics) == 0:
sum_metrics = metrics
else:
sum_metrics = {key: metrics[key] + sum_metrics[key] for key in metrics}
aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics}
augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup'] else math.factorial(nvar)*aveg_metrics[key] for key in sum_metrics}
# # Testing on greedy heuristics
# for heuristic in ['brown', 'gmods', 'random', 'virtual best']:
# reps = 100
# sum_metrics = dict()
# for i in range(reps):
# if heuristic == 'virtual best':
# chosen_indices = [np.argmin(timings) for timings in testing_dataset['timings']]
# elif heuristic == 'random':
# chosen_indices = [random.randint(0, 5) for timings in testing_dataset['timings']]
# else:
# chosen_indices = [choose_order_given_projections(projection, heuristic)
# for projection in testing_dataset['projections']]
# metrics = compute_metrics(chosen_indices,
# testing_dataset['labels'],
# testing_dataset['timings'],
# testing_dataset['cells'])
# if len(sum_metrics) == 0:
# sum_metrics = metrics
# else:
# sum_metrics = {key: metrics[key] + sum_metrics[key] for key in metrics}
# aveg_metrics = {key: sum_metrics[key]/reps for key in sum_metrics}
# augmented_metrics = {key: aveg_metrics[key] if key in ['Accuracy', 'Markup'] else math.factorial(nvar)*aveg_metrics[key] for key in sum_metrics}

print(heuristic, augmented_metrics)
if first_heuristic == 1:
first_heuristic = 0
keys = list(augmented_metrics.keys())
with open(output_file, 'a') as f:
f.write('Now choosing greedily \n')
f.write(', '.join(['Model'] + keys) + '\n')
with open(output_file, 'a', newline='') as f:
writer = csv.writer(f)
writer.writerow([heuristic] + [augmented_metrics[key] for key in keys])
# print(sum(min(timings) for timings in testing_dataset['timings']))
# print(heuristic, augmented_metrics)
# if first_heuristic == 1:
# first_heuristic = 0
# keys = list(augmented_metrics.keys())
# with open(output_file, 'a') as f:
# f.write('Now choosing greedily \n')
# f.write(', '.join(['Model'] + keys) + '\n')
# with open(output_file, 'a', newline='') as f:
# writer = csv.writer(f)
# writer.writerow([heuristic] + [augmented_metrics[key] for key in keys])
# # print(sum(min(timings) for timings in testing_dataset['timings']))
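A quick worked example of choose_gmods, assuming features 2, 67 and 132 hold the gmods measure of variables 0, 1 and 2 respectively, and that the six return values index the orderings in the same sequence as itertools.permutations((0, 1, 2)):

# permutations((0, 1, 2)): (0,1,2), (0,2,1), (1,0,2), (1,2,0), (2,0,1), (2,1,0)
features = [0.0] * 133
features[2], features[67], features[132] = 5, 3, 7  # measures of vars 0, 1, 2
# variable 1 has the smallest measure and variable 0 beats variable 2,
# so the chosen ordering is (1, 0, 2), at index 2
print(choose_gmods(features))  # -> 2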
6 changes: 3 additions & 3 deletions main_regression.py
@@ -26,13 +26,13 @@
# Hyperparameter tuning takes a very long time;
# tune_hyperparameters decides whether to tune them
# or to use previously tuned values
tune_hyperparameters = True
tune_hyperparameters = False
taking_logarithms = False

for i in range(1):
# cleaning_dataset()
# create_train_test_datasets()
# create_regression_datasets(taking_logarithms=taking_logarithms)
create_regression_datasets(taking_logarithms=taking_logarithms)

paradigm = "regression"
if tune_hyperparameters:
@@ -62,7 +62,7 @@
first_time = 0
keys = list(metrics.keys())
with open(output_file, 'a') as f:
f.write('No more cheating; no taking logarithms also\n')
f.write('After changing dataset\n')
f.write(', '.join(['Model'] + keys) + '\n')
with open(output_file, 'a', newline='') as f:
writer = csv.writer(f)
79 changes: 49 additions & 30 deletions packages/dataset_manipulation/dataset_manipulation.py
@@ -4,35 +4,53 @@
import random
from .exploit_symmetries import give_all_symmetries
from .exploit_symmetries import augmentate_timings
from itertools import permutations
# from sklearn.preprocessing import normalize

nvar = 3


def augmentate_dataset(features, targets, timings, cells):
def augmentate_instance(features, timings, cells, nvar):
    """Return the features, timings and cells of every variable ordering
    obtained by permuting the variables of the given instance."""
    variables = list(range(nvar))
split_features = [features[i*len(features)//nvar:(i+1)*len(features)//nvar]
for i in range(nvar)]
dict_timings = {str(perm): timing for perm, timing
in zip(permutations(variables), timings)}
dict_cells = {str(perm): cell for perm, cell in zip(permutations(variables), cells)}
augmented_features, augmented_timings, augmented_cells = [], [], []
for perm in permutations(variables):
augmented_features.append([feature for i in perm
for feature in split_features[i]])
augmented_timings.append([dict_timings[str(double_perm)]
for double_perm in permutations(perm)])
augmented_cells.append([dict_cells[str(double_perm)]
for double_perm in permutations(perm)])
return augmented_features, augmented_timings, augmented_cells



def augmentate_dataset(all_features, all_timings, all_cells, nvar):
"""
Multiply the size of the dataset by 6.
Multiply the size of the dataset by math.factorial(nvar).

Arguments:
    all_features: list(list(numpy.float))
    all_timings: list(list(numpy.float))
    all_cells: list(list(numpy.float))
"""
symmetric_features = []
symmetric_targets = []
symmetric_timings = []
symmetric_cells = []
for features, target, timing, cell in \
zip(features, targets, timings, cells):
symmetric_features += give_all_symmetries(features, int(target))
symmetric_targets += list(range(math.factorial(nvar)))
symmetric_timings += augmentate_timings(timing, int(target))
symmetric_cells += augmentate_timings(cell, int(target))

return np.array(symmetric_features), np.array(symmetric_targets), \
np.array(symmetric_timings), np.array(symmetric_cells)


def balance_dataset(features, targets, timings, cells):
augmented_features = []
augmented_timings = []
augmented_cells = []
for features, timings, cells in \
zip(all_features, all_timings, all_cells):
new_features, new_timings, new_cells = \
augmentate_instance(features, timings, cells, nvar)
augmented_features += new_features
augmented_timings += new_timings
augmented_cells += new_cells
return augmented_features, augmented_timings, augmented_cells


def balance_dataset(all_features, all_timings, all_cells, nvar):
"""
Balance the dataset so all targets are almost equally common.

@@ -41,21 +59,22 @@ def balance_dataset(features, targets, timings, cells):
targets: list(numpy.float)
"""
balanced_features = []
balanced_targets = []
balanced_timings = []
balanced_cells = []
for features, target, timing, cell in \
zip(features, targets, timings, cells):
symmetric_features = give_all_symmetries(features, int(target))
symmetric_timings = augmentate_timings(timing, int(target))
symmetric_cells = augmentate_timings(cell, int(target))
for features, timings, cells in \
zip(all_features, all_timings, all_cells):
new_target = random.choice(list(range(math.factorial(nvar))))
balanced_features.append(symmetric_features[new_target])
balanced_targets.append(new_target)
balanced_timings.append(symmetric_timings[new_target])
balanced_cells.append(symmetric_cells[new_target])
return np.array(balanced_features), np.array(balanced_targets),\
np.array(balanced_timings), np.array(balanced_cells)
new_features, new_timings, new_cells = \
augmentate_instance(features, timings, cells, nvar)
balanced_features.append(new_features[new_target])
balanced_timings.append(new_timings[new_target])
balanced_cells.append(new_cells[new_target])
return balanced_features, balanced_timings, balanced_cells

# features = [1,2,3,4,5,6]
# timings = [10,20,30,40,50,60]
# cells = [21,32,43,54,65,76]
# print(balance_dataset([features], [timings], [cells], 3))


def name_unique_features(names, features):
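A small sanity check of augmentate_instance, in the spirit of the commented-out test above; it assumes nvar = 3, so the six features split into the per-variable blocks [1, 2], [3, 4] and [5, 6]:

features = [1, 2, 3, 4, 5, 6]
timings = [10, 20, 30, 40, 50, 60]  # one timing per variable ordering
cells = [21, 32, 43, 54, 65, 76]
aug_features, aug_timings, aug_cells = \
    augmentate_instance(features, timings, cells, 3)
# the third permutation of (0, 1, 2) is (1, 0, 2),
# which moves variable 1's feature block to the front
print(aug_features[2])  # -> [3, 4, 1, 2, 5, 6]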
30 changes: 23 additions & 7 deletions test_models.py
@@ -1,18 +1,20 @@
import csv
import math
import pickle
import importlib.util
import numpy as np
from sklearn import metrics
from config.general_values import dataset_qualities
from config.ml_models import ml_models
from config.ml_models import ml_regressors
from find_filename import find_output_filename
from find_filename import find_dataset_filename
from find_filename import find_model_filename
# Check if 'dataset_manipulation' is installed
if importlib.util.find_spec('dataset_manipulation') is None:
from exploit_symmetries import give_all_symmetries
from dataset_manipulation import augmentate_instance
else:
from packages.dataset_manipulation.exploit_symmetries import give_all_symmetries
from packages.dataset_manipulation.dataset_manipulation import augmentate_instance


# def test_model(trained_model_filename, test_dataset_filename):
@@ -104,8 +106,15 @@ def test_model(ml_model, paradigm, testing_method='augmented'):
model = pickle.load(trained_model_file)
with open(test_dataset_filename, 'rb') as test_dataset_file:
testing_dataset = pickle.load(test_dataset_file)
chosen_indices = [return_regressor_choice(model, features)
for features in testing_dataset['features']]
print("here")
if ml_model in ml_regressors:
chosen_indices = [return_regressor_choice(model, features)
for features in testing_dataset['features']]
else:
chosen_indices = [model.predict([features])[0]
for features in testing_dataset['features']]
print(chosen_indices)
print("here2")
return compute_metrics(chosen_indices,
testing_dataset['labels'],
testing_dataset['timings'],
@@ -123,6 +132,7 @@ def compute_metrics(chosen_indices, labels, all_timings, all_cells):
zip(chosen_indices, labels, all_timings, all_cells):
if chosen_index == label:
correct += 1
print(timings, chosen_index)
if timings[chosen_index] not in [30, 60]:
metrics['Completed'] += 1
metrics['Total time'] += timings[chosen_index]
@@ -135,11 +145,17 @@ def compute_metrics(chosen_indices, labels, all_timings, all_cells):


def return_regressor_choice(model, features):
features_all_symmetries = give_all_symmetries(features)
nvar = 3 ## Make this better
made_up_timings = list(range(math.factorial(nvar)))
made_up_cells = list(range(math.factorial(nvar)))
augmentated_features, _, _ = \
augmentate_instance(features, made_up_timings, made_up_cells, nvar)
y_op = float('inf')
for index, x_features in enumerate(features_all_symmetries):
# print(x_features)
for index, x_features in enumerate(augmentated_features):
y_pred = model.predict([x_features])
########
# THIS IS NOT A LIST??
########
# print(y_pred)
if y_op > y_pred:
y_op = y_pred
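The diff is cut off before return_regressor_choice returns; a sketch of the remaining logic, assuming the intention is to pick the symmetry with the smallest predicted timing. On the "THIS IS NOT A LIST??" note: model.predict([x]) returns a length-1 array, so indexing with [0] extracts the scalar:

    y_op = float('inf')
    best_index = 0
    for index, x_features in enumerate(augmentated_features):
        # predict returns a length-1 array; [0] extracts the scalar
        y_pred = model.predict([x_features])[0]
        if y_pred < y_op:
            y_op = y_pred
            best_index = index
    return best_index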
10 changes: 7 additions & 3 deletions test_train_datasets.py
@@ -57,20 +57,24 @@ def create_train_test_datasets():
dataset['cells'],
test_size=0.20,
random_state=random_state)
keys = ['features', 'labels', 'timings', 'cells']
keys = ['features', 'timings', 'cells']
for purpose in purposes:
datasets[f'{purpose}_Balanced'] = \
{key: elem for key,
elem in zip(keys, balance_dataset(
*[datasets[f'{purpose}_Normal'][key2]
for key2 in keys]))
for key2 in keys], nvar=3)) ##CHOOSE NVAR WELL
}
datasets[f'{purpose}_Balanced']['labels'] = \
[timings.index(min(timings)) for timings in datasets[f'{purpose}_Balanced']['timings']]
datasets[f'{purpose}_Augmented'] = \
{key: elem for key,
elem in zip(keys, augmentate_dataset(
*[datasets[f'{purpose}_Normal'][key2]
for key2 in keys]))
for key2 in keys], nvar=3))
}
datasets[f'{purpose}_Augmented']['labels'] = \
[timings.index(min(timings)) for timings in datasets[f'{purpose}_Augmented']['timings']]
for purpose in purposes:
for quality in dataset_qualities:
this_dataset_filename = \
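The labels are now recomputed from the permuted timings rather than augmented alongside them; a one-line illustration of the rule used above:

timings = [30.0, 12.5, 60.0, 18.2, 25.0, 14.1]
label = timings.index(min(timings))  # -> 1, the fastest ordering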
6 changes: 0 additions & 6 deletions train_models.py
@@ -48,11 +48,6 @@ def train_regression_model(ml_model, method):
# trained_model_filename = find_model_filename(method, ml_model, 'regression')
# with open(trained_model_filename, 'wb') as trained_model_file:
# pickle.dump(reg, trained_model_file)
print("Real")
print(train_dataset['timings'][10:20])
print("Predicted")
print(reg.predict(train_dataset['features'])[10:20])
print(metrics.mean_squared_error(reg.predict(train_dataset['features']), train_dataset['timings']))
return reg


@@ -130,7 +125,6 @@ def get_vars_features(polynomials):
unique_features_filename = find_other_filename("unique_features")
    with open(unique_features_filename, 'rb') as unique_features_file:
unique_names = pickle.load(unique_features_file)
print(unique_names)
for var in range(nvar):
var_features, var_names = \
compute_features_for_var(polynomials, var)
