Now the times in the dataset are returned correctly
Tereso del Rio committed Jun 21, 2023
1 parent d42164a commit 57adcb1
Showing 10 changed files with 93 additions and 45 deletions.
21 changes: 12 additions & 9 deletions create_clean_dataset.py
@@ -2,7 +2,6 @@
the sets of polynomials and its timings for each order, creates a dataset
containing a set of unique features and its class"""

import os
import pickle
import numpy as np
from replicating_Dorians_features import extract_features
@@ -12,6 +11,7 @@
else:
from packages.dataset_manipulation import remove_notunique_features
from from_poly_set_to_features import poly_set_feature_extractor
from find_filename import find_dataset_filename


def create_dataframe(dataset):
@@ -22,9 +22,10 @@ def create_dataframe(dataset):
for index, all_projections in enumerate(dataset[0]):
original_polynomials = all_projections[0][0]
all_original_polynomials.append(original_polynomials)
names, all_features = poly_set_feature_extractor(all_original_polynomials,
determine_standarization=True,
determine_unique_features=True)
names, all_features =\
poly_set_feature_extractor(all_original_polynomials,
determine_standarization=True,
determine_unique_features=True)
return np.array(all_original_polynomials), np.array(names),\
np.array(all_features), np.array(all_targets), np.array(all_timings)

@@ -34,14 +35,17 @@ def create_dataframe(dataset):
# 'dataset_without_repetition_return_ncells.txt')
# with open(dataset_filename, 'rb') as f:
# dataset = pickle.load(f)
# original_polys_list, names, features_list, targets_list, timings_list = create_dataframe(dataset)
# original_polys_list, names, features_list, targets_list, timings_list =\
# create_dataframe(dataset)


def cleaning_dataset(dataset_filename, clean_dataset_filename):
def cleaning_dataset():
dataset_filename = find_dataset_filename('unclean')
clean_dataset_filename = find_dataset_filename('clean')
with open(dataset_filename, 'rb') as f:
dataset = pickle.load(f)
original_polys_list, names, features_list, targets_list, timings_list = extract_features(dataset)

original_polys_list, names, features_list, targets_list, timings_list =\
extract_features(dataset)
# working with raw features
features = np.array(features_list)
unique_names, unique_features = remove_notunique_features(names, features)
@@ -54,7 +58,6 @@ def cleaning_dataset(dataset_filename, clean_dataset_filename):
unique_features, targets, timings),
clean_dataset_file)


# dataset_filename = os.path.join(os.path.dirname(__file__),
# 'DatasetsBeforeProcessing',
# 'dataset_without_repetition_return_ncells.txt')
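After this change, cleaning_dataset resolves its own input and output paths through find_dataset_filename, so callers no longer pass them. A minimal usage sketch under that assumption:

# Minimal sketch of the refactored call site; paths are resolved internally
# via find_dataset_filename('unclean') and find_dataset_filename('clean').
from create_clean_dataset import cleaning_dataset

cleaning_dataset()  # reads the unclean dataset, pickles the cleaned one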
4 changes: 2 additions & 2 deletions datasets/dataset_instances.csv
@@ -1,7 +1,7 @@
dataset,zero,one,two,three,four,five,total
train normal dataset,326,74,105,41,163,106,815
train balanced dataset,118,136,125,149,134,153,815
train balanced dataset,146,120,132,150,125,142,815
train augmented dataset,815,815,815,815,815,815,4890
test normal dataset,80,19,30,10,39,26,204
test balanced dataset,39,32,36,29,31,37,204
test balanced dataset,35,42,33,39,28,27,204
test augmented dataset,204,204,204,204,204,204,1224
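The augmented counts are exactly 3! = 6 copies of each normal instance (one per variable ordering of the three variables), which is why the augmented totals are six times the normal ones. A quick arithmetic check:

import math

nvar = 3
print(815 * math.factorial(nvar))  # 4890, the augmented train total
print(204 * math.factorial(nvar))  # 1224, the augmented test total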
Binary file modified datasets/test/augmented_test_dataset.txt
Binary file modified datasets/test/balanced_test_dataset.txt
Binary file modified datasets/train/augmented_train_dataset.txt
Binary file modified datasets/train/balanced_train_dataset.txt
46 changes: 25 additions & 21 deletions main.py
@@ -26,29 +26,33 @@
# Hyperparameter tuning takes a very long time,
# so tune_hyperparameters is used to decide whether to tune them
# or to use previously tuned ones
tune_hyperparameters = False

original_dataset_file = find_dataset_filename('unclean')
clean_dataset_filename = find_dataset_filename('clean')
cleaning_dataset(original_dataset_file, clean_dataset_filename)
create_train_test_datasets()

if tune_hyperparameters:
for ml_model in ml_models:
for method in dataset_types:
print(f"Choosing hyperparameters for {ml_model} in {method}")
choose_hyperparams(ml_model, method)
for ml_model in ml_models:
print(f"Training {ml_model}")
for method in dataset_types:
print(f"for {method}")
train_model(ml_model, method)
for training_method in dataset_types:
print(f"Testing models trained in {training_method}")
test_results(training_method)
# tune_hyperparameters = False


# cleaning_dataset()
# create_train_test_datasets()

# if tune_hyperparameters:
# for ml_model in ml_models:
# for method in dataset_types:
# print(f"Choosing hyperparameters for {ml_model} in {method}")
# choose_hyperparams(ml_model, method)
# for ml_model in ml_models:
# print(f"Training {ml_model}")
# for method in dataset_types:
# print(f"for {method}")
# train_model(ml_model, method)
# for training_method in dataset_types:
# print(f"Testing models trained in {training_method}")
# test_results(training_method)

timings = dict()
model = 'SVC'
testing_method = 'Augmented'
for training_method in dataset_types:
print(f"Testing models trained in {training_method}")
print(timings_in_test(model, testing_method, training_method))
timings[training_method] = timings_in_test(model, testing_method, training_method)

from make_plots import survival_plot

survival_plot(timings)
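make_plots.survival_plot is not part of this diff. Below is a hedged sketch of the kind of plot it presumably produces, assuming each value in timings is a list of per-instance CAD times for one training method (the function and variable names are illustrative only, not the repository's implementation):

# Hedged sketch only; not the repository's survival_plot.
import numpy as np
import matplotlib.pyplot as plt

def survival_plot_sketch(timings):
    """Plot, for each training method, how many instances finish within t seconds."""
    for method, times in timings.items():
        times = np.sort(np.asarray(times, dtype=float))
        solved = np.arange(1, len(times) + 1)
        plt.step(times, solved, where='post', label=method)
    plt.xlabel('time (s)')
    plt.ylabel('instances solved')
    plt.legend()
    plt.show()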
14 changes: 8 additions & 6 deletions packages/dataset_manipulation/dataset_manipulation.py
@@ -3,6 +3,7 @@
import math
import random
from .exploit_symmetries import give_all_symmetries
from .exploit_symmetries import augmentate_timings
# from sklearn.preprocessing import normalize

nvar = 3
@@ -22,7 +23,8 @@ def augmentate_dataset(features, targets, timings):
for features, target, timing in zip(features, targets, timings):
symmetric_features += give_all_symmetries(features, int(target))
symmetric_targets += list(range(math.factorial(nvar)))
symmetric_timings += list(timing)
symmetric_timings += augmentate_timings(timing, int(target))

return np.array(symmetric_features), np.array(symmetric_targets), \
np.array(symmetric_timings)

@@ -40,11 +42,11 @@ def balance_dataset(features, targets, timings):
balanced_timings = []
for features, target, timing in zip(features, targets, timings):
symmetric_features = give_all_symmetries(features, int(target))
possible_targets = list(range(math.factorial(nvar)))
new_target = random.choice(possible_targets)
symmetric_timings = augmentate_timings(timing, int(target))
new_target = random.choice(list(range(math.factorial(nvar))))
balanced_features.append(symmetric_features[new_target])
balanced_targets.append(new_target)
balanced_timings.append(timing[new_target])
balanced_timings.append(symmetric_timings[new_target])
return np.array(balanced_features), np.array(balanced_targets),\
np.array(balanced_timings)

@@ -88,10 +90,10 @@ def get_unique_feature_names(unique_names, names, features):
return np.transpose(unique_features)


def remove_notunique_features(names, features):
def remove_notunique_features(names, features, nvar=3):
# creating some targets and timing because the function requires them
targets = [0]*len(features)
timings = [[0, 0]]*len(features)
timings = [list(range(math.factorial(nvar)))]*len(features)
augmented_features, _, _ = augmentate_dataset(features, targets, timings)
# normalized_augmented_features = normalize(augmented_features)
unique_names = name_unique_features(names, augmented_features)
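The behavioural fix in balance_dataset is that the timing kept for the randomly chosen target is now taken from the permuted list symmetric_timings, so the selected feature row and its timing describe the same variable ordering (the old timing[new_target] indexed the unpermuted timings). A small sketch of that pairing, with toy values and an import path assumed from the file layout:

# Sketch of the feature/timing alignment the fix enforces; toy values,
# and the import path is an assumption based on the repository layout.
import random
from packages.dataset_manipulation.exploit_symmetries import (
    augmentate_timings, give_all_symmetries)

features = list(range(12))                # toy vector: 4 features per variable
timings = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]  # one timing per variable ordering
target = 2                                # index of the optimal ordering

symmetric_features = give_all_symmetries(features, target)
symmetric_timings = augmentate_timings(timings, target)
new_target = random.choice(range(6))
# Both entries now refer to the same variable ordering:
balanced_feature_row = symmetric_features[new_target]
balanced_timing = symmetric_timings[new_target]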
49 changes: 42 additions & 7 deletions packages/dataset_manipulation/exploit_symmetries.py
@@ -14,32 +14,35 @@
"""
from itertools import permutations

nvar = 3
variables = list(range(nvar))
perms = [list(elem) for elem in permutations(variables)]

def get_perms(variables):
perms = [list(elem) for elem in permutations(variables)]
return perms

def features_to_canonical_target(features, optimal_ordering):

def features_to_canonical_target(features, optimal_ordering, nvar=3):
"""
Reorder the features for the target to be '1'.

This is done by reordering the features according to the optimal variable
ordering of the set of polynomials.
"""
variable_orderings = perms[optimal_ordering]
perms = get_perms(list(range(nvar)))
best_variable_ordering = perms[optimal_ordering]
nfeatures = len(features)
split_features = [features[int(var*nfeatures/nvar):
int((var+1)*nfeatures/nvar)]
for var in range(nvar)]
ordered_features = [split_features[variable_orderings[i]]
ordered_features = [split_features[best_variable_ordering[i]]
for i in range(nvar)]
return ordered_features


def give_all_symmetries(features, optimal_ordering):
def give_all_symmetries(features, optimal_ordering, nvar=3):
"""Reorder the features for all possible targets.
    Returns a list of all symmetries, the first one
corresponding to the optimal ordering"""
perms = get_perms(list(range(nvar)))
ordered_features = features_to_canonical_target(features,
optimal_ordering)
all_symmetries = []
@@ -51,3 +54,35 @@ def give_all_symmetries(features, optimal_ordering):
for elem in lst]
all_symmetries.append(flatten_new_order_features)
return all_symmetries


def augmentate_timings(timings, optimal_ordering, nvar=3):
"""Given all the timings returns a list of all the possible reorderings
so that the first reordering corresponds to the optimal ordering and
the others follow that"""
perms = get_perms(list(range(nvar)))
best_variable_ordering = perms[optimal_ordering]
new_perms = get_perms(best_variable_ordering)
all_timings = []
for perm in new_perms:
# compute in which index this perm used to be
perm_index = perms.index(perm)
# find associated timing and append
all_timings.append(reorder_timings(timings, perm_index, nvar=3))
return all_timings


def reorder_timings(timings, first_ordering, nvar=3):
"""Given all the timings reorder them so that the first one
corresponds to first_ordering and the rest from the usual
permutations done from it"""
perms = get_perms(list(range(nvar)))
first_variable_ordering = perms[first_ordering]
new_perms = get_perms(first_variable_ordering)
new_timings = []
for perm in new_perms:
# compute in which index this perm used to be
perm_index = perms.index(perm)
# find associated timing and append
new_timings.append(timings[perm_index])
return new_timings
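A concrete trace makes the reordering easier to follow. For three variables the canonical permutation list is [[0,1,2], [0,2,1], [1,0,2], [1,2,0], [2,0,1], [2,1,0]]; with optimal_ordering = 2 the best ordering is [1, 0, 2], and augmentate_timings returns the timings in the order of the permutations generated from that ordering (the import path is assumed from the file layout):

# Worked trace of augmentate_timings; import path assumed from the repository layout.
from packages.dataset_manipulation.exploit_symmetries import augmentate_timings

timings = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0]  # one timing per canonical ordering

print(augmentate_timings(timings, 2))
# [30.0, 40.0, 10.0, 20.0, 60.0, 50.0]
# The first entry (30.0) is the timing of the optimal ordering [1, 0, 2].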
4 changes: 4 additions & 0 deletions test_train_datasets.py
@@ -24,6 +24,9 @@ def create_train_test_datasets():
with open(clean_dataset_filename, 'rb') as clean_dataset_file:
_, names, features, targets, timings = pickle.load(clean_dataset_file)
unique_names, unique_features = remove_notunique_features(names, features)
# features were already unique because of create_clean_dataset
# decide where to remove the features
print("create_train_test", timings)
unique_features_filename = find_other_filename("unique_features")
with open(unique_features_filename, 'wb') as unique_features_file:
pickle.dump(unique_features_filename, unique_features_file)
@@ -35,6 +38,7 @@ def create_train_test_datasets():
x['train_normal'], x['test_normal'], y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(unique_features, targets, timings,
test_size=0.20,
random_state=random_state)

for purpose in ['train', 'test']:
x[f'{purpose}_balanced'], y[f'{purpose}_balanced'], t[f'{purpose}_balanced'] = balance_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
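The three-way split above relies on scikit-learn's train_test_split accepting any number of parallel arrays and splitting them consistently, which is what keeps features, targets and timings aligned. A self-contained illustration with toy data:

# Toy illustration of splitting parallel arrays consistently with train_test_split.
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(20).reshape(10, 2)   # toy features
y = np.arange(10)                  # toy targets
t = np.arange(10, 20)              # toy timings
X_tr, X_te, y_tr, y_te, t_tr, t_te = train_test_split(
    X, y, t, test_size=0.20, random_state=0)
print(X_te.shape, y_te.shape, t_te.shape)  # (2, 2) (2,) (2,)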
