Commit 051d649

More metrics can be computed now

Tereso del Rio committed Sep 12, 2023
1 parent e9227ce commit 051d649
Showing 13 changed files with 142 additions and 73 deletions.
34 changes: 20 additions & 14 deletions create_clean_dataset.py
@@ -12,6 +12,7 @@
from packages.dataset_manipulation import remove_notunique_features
from from_poly_set_to_features import poly_set_feature_extractor
from find_filename import find_dataset_filename
from find_filename import find_other_filename


def create_dataframe(dataset):
@@ -44,21 +45,26 @@ def cleaning_dataset():
clean_dataset_filename = find_dataset_filename('clean')
with open(dataset_filename, 'rb') as f:
dataset = pickle.load(f)
original_polys_list, names, features_list, targets_list, timings_list =\
extract_features(dataset)
# working with raw features
features = np.array(features_list)
unique_names, unique_features = remove_notunique_features(names, features)

targets = np.array(targets_list)
timings = np.array([[convert_to_timing(timings_ordering)
for timings_ordering in timings_problem]
for timings_problem in timings_list])
original_polys = np.array(original_polys_list)
my_dataset = extract_features(dataset)
clean_dataset = dict()
# # working with raw features
# features = np.array(features_list)
clean_dataset['names'], clean_dataset['features'] = \
remove_notunique_features(my_dataset['names'],
my_dataset['features'])
unique_features_filename = find_other_filename("unique_features")
with open(unique_features_filename, 'wb') as unique_features_file:
pickle.dump(clean_dataset['names'], unique_features_file)
# Some timings are expressed as "Over 30", this is changed here
clean_dataset['timings'] = \
np.array([[convert_to_timing(timings_ordering)
for timings_ordering in timings_problem]
for timings_problem in my_dataset['timings']])
for key in my_dataset:
if key not in clean_dataset:
clean_dataset[key] = my_dataset[key]
with open(clean_dataset_filename, 'wb') as clean_dataset_file:
dataset = pickle.dump((original_polys, unique_names,
unique_features, targets, timings),
clean_dataset_file)
pickle.dump(clean_dataset, clean_dataset_file)

# dataset_filename = os.path.join(os.path.dirname(__file__),
# 'DatasetsBeforeProcessing',
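The comment in the new code notes that some timings are stored as the string "Over 30" and must be converted before being packed into a numpy array. convert_to_timing itself is not part of this diff, so the following is only a minimal sketch of what such a conversion might look like; the marker string and the penalty value assigned to over-limit runs are assumptions.

# Hypothetical sketch only: the real convert_to_timing is not shown in this diff.
# Assumes a raw timing is either a number or the string "Over 30", and that
# over-limit runs are replaced by a fixed penalty value.
OVER_LIMIT_PENALTY = 60.0  # assumed penalty; the repository may use another value


def convert_to_timing(raw_timing):
    """Map a raw timing entry to a float that numpy can store."""
    if isinstance(raw_timing, str) and raw_timing.startswith("Over"):
        return OVER_LIMIT_PENALTY
    return float(raw_timing)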
Binary file modified datasets/clean_dataset.txt
Binary file not shown.
4 changes: 2 additions & 2 deletions datasets/dataset_instances.csv
@@ -1,7 +1,7 @@
dataset,zero,one,two,three,four,five,total
train normal dataset,326,74,105,41,163,106,815
train balanced dataset,125,140,134,121,136,159,815
train balanced dataset,137,123,145,150,122,138,815
train augmented dataset,815,815,815,815,815,815,4890
test normal dataset,80,19,30,10,39,26,204
test balanced dataset,30,38,36,36,30,34,204
test balanced dataset,30,30,41,35,36,32,204
test augmented dataset,204,204,204,204,204,204,1224
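These counts are regenerated by test_train_datasets.py (see the count_instances helper and the writer.writerow call further down in this commit). A minimal sketch of how one such row could be produced, assuming count_instances simply tallies how often a class label appears in the target vector; the output file name and toy targets are made up.

import csv


def count_instances(targets, label):
    # Assumed behaviour of the repository's count_instances helper.
    return sum(1 for target in targets if target == label)


def write_instance_row(writer, name, targets, n_classes=6):
    # One CSV row: dataset name, one count per class, then the total.
    writer.writerow([name]
                    + [str(count_instances(targets, i)) for i in range(n_classes)]
                    + [str(len(targets))])


with open("dataset_instances_example.csv", "w", newline="") as f:  # toy output file
    writer = csv.writer(f)
    writer.writerow(["dataset", "zero", "one", "two", "three", "four", "five", "total"])
    write_instance_row(writer, "train normal dataset", [0, 1, 2, 0, 5, 3, 4])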
Binary file modified datasets/test/augmented_test_dataset.txt
Binary file not shown.
Binary file modified datasets/test/balanced_test_dataset.txt
Binary file not shown.
Binary file modified datasets/test/normal_test_dataset.txt
Binary file not shown.
Binary file modified datasets/train/augmented_train_dataset.txt
Binary file not shown.
Binary file modified datasets/train/balanced_train_dataset.txt
Binary file not shown.
Binary file modified datasets/train/normal_train_dataset.txt
Binary file not shown.
68 changes: 41 additions & 27 deletions main.py
@@ -12,6 +12,7 @@
Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science,
vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30
"""
import csv
from config.ml_models import ml_models
from config.ml_models import dataset_types
from find_filename import find_dataset_filename
@@ -29,7 +30,7 @@
# if tune_hyperparameters is used to decide whether to tune them
# or to use previously tuned ones
# tune_hyperparameters = False

paradigm = 'classification'

# cleaning_dataset()
# create_train_test_datasets()
@@ -44,31 +45,44 @@
# for method in dataset_types:
# print(f"for {method}")
# train_model(ml_model, method)
# for training_method in dataset_types:
# print(f"Testing models trained in {training_method}")
# test_results(training_method)

timings = dict()
training_method = 'augmented'
testing_method = 'augmented'
test_dataset_filename = find_dataset_filename('test',
testing_method)

with open("classification_output_timings.csv", 'w') as f:
f.write("model, Normal, Balanced, Augmented\n")
first_time = 1
output_file = "classification_output_acc_time.csv"
for ml_model in ml_models:
for training_method in dataset_types:
trained_model_filename = find_model_filename(training_method,
ml_model)
accuracy = test_model(trained_model_filename,
test_dataset_filename)
timings[training_method] = timings_in_test(ml_model, testing_method,
training_method)
total_time = sum(timings[training_method])
# with open("classification_output_acc_time.csv", 'a') as f:
# f.write(f"{ml_model}, {accuracy}, {total_time}\n")
with open("classification_output_timings.csv", 'a') as f:
f.write(f"{ml_model}, {sum(timings['Normal'])}, {sum(timings['Balanced'])}, {sum(timings['Augmented'])}\n")
timings['optimal'] = timings_in_test('optimal', testing_method)
print(sum(timings['optimal']))
from make_plots import survival_plot
survival_plot(timings, plot_name=f"survival_plot_{ml_model}")
print(f"Testing models trained in {training_method}")
metrics = test_model(ml_model, paradigm=training_method, testing_method=testing_method)
if first_time == 1:
first_time = 0
keys = list(metrics.keys())
with open(output_file, 'a') as f:
f.write(', '.join(['Model'] + keys) + '\n')
with open(output_file, 'a', newline='') as f:
writer = csv.writer(f)
writer.writerow([ml_model] + [metrics[key] for key in keys])


# timings = dict()
# testing_method = 'augmented'
# test_dataset_filename = find_dataset_filename('test',
# testing_method)

# with open("classification_output_timings.csv", 'w') as f:
# f.write("model, Normal, Balanced, Augmented\n")
# for ml_model in ml_models:
# for training_method in dataset_types:
# trained_model_filename = find_model_filename(training_method,
# ml_model)
# accuracy = test_model(trained_model_filename,
# test_dataset_filename)
# timings[training_method] = timings_in_test(ml_model, testing_method,
# training_method)
# total_time = sum(timings[training_method])
# # with open("classification_output_acc_time.csv", 'a') as f:
# # f.write(f"{ml_model}, {accuracy}, {total_time}\n")
# with open("classification_output_timings.csv", 'a') as f:
# f.write(f"{ml_model}, {sum(timings['Normal'])}, {sum(timings['Balanced'])}, {sum(timings['Augmented'])}\n")
# timings['optimal'] = timings_in_test('optimal', testing_method)
# print(sum(timings['optimal']))
# from make_plots import survival_plot
# survival_plot(timings, plot_name=f"survival_plot_{ml_model}")
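The commented-out block above calls make_plots.survival_plot on the per-problem timings; its implementation is not part of this diff. As a rough illustration, a survival plot typically shows, for each method, how many problems finish within a given time budget. A generic sketch under that assumption:

# Generic sketch of a survival plot over per-problem timings; the repository's
# make_plots.survival_plot may differ in details such as styling and scales.
import numpy as np
import matplotlib.pyplot as plt


def survival_plot(timings, plot_name="survival_plot"):
    # timings maps a method name to a list of per-problem solving times.
    for method, times in timings.items():
        sorted_times = np.sort(np.asarray(times, dtype=float))
        problems_solved = np.arange(1, len(sorted_times) + 1)
        plt.step(sorted_times, problems_solved, where="post", label=method)
    plt.xlabel("time budget (seconds)")
    plt.ylabel("problems solved within the budget")
    plt.legend()
    plt.savefig(f"{plot_name}.png")
    plt.close()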
60 changes: 46 additions & 14 deletions replicating_Dorians_features.py
@@ -1,6 +1,10 @@
"""IS THIS BEING USED?"""
"""
IS THIS BEING USED?
YES, IT IS!
"""

import itertools
from xml.sax.handler import all_features
# from xml.sax.handler import all_features
import numpy as np


@@ -9,7 +13,8 @@ def aveg(given_list):


def aveg_not_zero(given_list):
return sum(given_list)/max(1,len([1 for elem in given_list if elem!=0]))
return sum(given_list)/max(1, len([1 for elem in given_list
if elem != 0]))


def identity(input):
@@ -30,34 +35,57 @@ def sign(input):
raise Exception("How is this possible?")


def create_features(degrees, variable=0, sv=False):
functions = [sum, max, aveg, aveg_not_zero]
def create_features(degrees, variable=0, sv=False,
include_aveg_not_zero=False):
if include_aveg_not_zero:
functions = [sum, max, aveg, aveg_not_zero]
else:
functions = [sum, max, aveg] # , aveg_not_zero]
sign_or_not = [identity, sign]
features = []
features_names = []
for choice in itertools.product(functions, sign_or_not, functions, sign_or_not):
feature_description = choice[0].__name__+"sign"*(choice[1].__name__=="sign")+"_in_polys_"+choice[2].__name__+"_"+"sign"*(choice[3].__name__=="sign")+"of_" + "sum_of_"*sv+"degrees_of_var_"+str(variable)+"_in_monomials"
feature_value = choice[0](choice[1]([choice[2](choice[3](degrees_in_poly)) for degrees_in_poly in degrees]))
for choice in itertools.product(functions,
sign_or_not, functions,
sign_or_not):
feature_description = (choice[0].__name__
+ "sign" * (choice[1].__name__ == "sign")
+ "_in_polys_" + choice[2].__name__ + "_"
+ "sign" * (choice[3].__name__ == "sign")
+ "of_" + "sum_of_" * sv + "degrees_of_var_"
+ str(variable) + "_in_monomials")
feature_value = \
choice[0](choice[1]([choice[2](choice[3](degrees_in_poly))
for degrees_in_poly in degrees]))
features.append(feature_value)
features_names.append(feature_description)
return features, features_names


def extract_features(dataset):
my_dataset = dict()
all_features = []
all_targets = []
all_timings = []
all_original_polynomials = []
for index, all_projections in enumerate(dataset[0]):
original_polynomials = all_projections[0][0]
all_projections = []
for index, projections in enumerate(dataset[0]):
all_projections.append(projections)
original_polynomials = projections[0][0]
# the original polynomials are the initial polynomials of any
# of the possible projections (also of the first one)
all_original_polynomials.append(original_polynomials)
all_targets.append(dataset[1][index])
all_timings.append(dataset[2][index])
names, instance_features = features_from_set_of_polys(original_polynomials)
names, instance_features = features_from_set_of_polys(
original_polynomials)
all_features.append(instance_features)
return np.array(all_original_polynomials), np.array(names), np.array(all_features), np.array(all_targets), np.array(all_timings)
my_dataset['polynomials'] = np.array(all_original_polynomials)
my_dataset['names'] = np.array(names)
my_dataset['features'] = np.array(all_features)
my_dataset['targets'] = np.array(all_targets)
my_dataset['timings'] = np.array(all_timings)
my_dataset['projections'] = np.array(all_projections)
return my_dataset


def features_from_set_of_polys(original_polynomials):
@@ -71,8 +99,12 @@ def features_from_set_of_polys(original_polynomials):
variable=var)
instance_features += var_features
names += var_features_names
sdegrees = [[sum(monomial) for monomial in poly if monomial[var]!=0]+[0] for poly in original_polynomials]
svar_features, svar_features_names = create_features(sdegrees, variable=var, sv=True)
sdegrees = \
[[sum(monomial) for monomial in poly if monomial[var] != 0] + [0]
for poly in original_polynomials]
svar_features, svar_features_names = create_features(sdegrees,
variable=var,
sv=True)
instance_features += svar_features
names += svar_features_names
return names, instance_features
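A worked toy example of the sdegrees construction above may help; it assumes each polynomial is stored as a list of monomials and each monomial as a tuple of exponents, one entry per variable (this representation is inferred from the code, not confirmed by the diff).

# Toy example of the sdegrees construction (assumed monomial representation).
original_polynomials = [
    [(2, 1), (0, 3)],   # x0^2*x1 + x1^3
    [(1, 0), (0, 0)],   # x0 + 1
]
var = 0
# Total degree of every monomial that contains variable `var`, with a trailing 0
# so polynomials not containing that variable still contribute an entry.
sdegrees = [[sum(monomial) for monomial in poly if monomial[var] != 0] + [0]
            for poly in original_polynomials]
print(sdegrees)  # [[3, 0], [1, 0]]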
42 changes: 27 additions & 15 deletions test_train_datasets.py
@@ -23,23 +23,29 @@ def count_instances(my_dataset, instance):
def create_train_test_datasets():
clean_dataset_filename = find_dataset_filename('clean')
with open(clean_dataset_filename, 'rb') as clean_dataset_file:
_, names, features, targets, timings = pickle.load(clean_dataset_file)
unique_names, unique_features = remove_notunique_features(names, features)
# features were already unique because of create_clean_dataset
# decide where to remove the features
print("create_train_test", timings)
unique_features_filename = find_other_filename("unique_features")
with open(unique_features_filename, 'wb') as unique_features_file:
pickle.dump(unique_features_filename, unique_features_file)
dataset = pickle.load(clean_dataset_file)

###
# Instead of creating dictionaries for features, labels, ...
# maybe it's better to create a dictionary for each dataset:
# train/test, normal/balanced/augmented
###
x = dict() # to keep the features
y = dict() # to keep the labels
t = dict() # to keep the timings
p = dict() # to keep the projections
# train and test sets are created
random_state = 0
x['train_normal'], x['test_normal'], y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(unique_features, targets, timings,
test_size=0.20,
random_state=random_state)

x['train_normal'], x['test_normal'], \
y['train_normal'], y['test_normal'], \
t['train_normal'], t['test_normal'], \
p['train_normal'], p['test_normal'] = \
train_test_split(dataset['features'],
dataset['targets'],
dataset['timings'],
dataset['projections'],
test_size=0.20,
random_state=random_state)
for purpose in ['train', 'test']:
x[f'{purpose}_balanced'], y[f'{purpose}_balanced'], t[f'{purpose}_balanced'] = balance_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
@@ -51,22 +57,28 @@ def create_train_test_datasets():
for method in ['normal', 'balanced', 'augmented']:
this_dataset_filename = find_dataset_filename(usage, method=method)
with open(this_dataset_filename, 'wb') as this_dataset_file:
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}'], t[f'{usage}_{method}']), this_dataset_file)
if method == 'normal':
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}'], t[f'{usage}_{method}'], p[f'{usage}_{method}']), this_dataset_file)
else:
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}'], t[f'{usage}_{method}']), this_dataset_file)

writer.writerow([f'{usage} {method} dataset']
+ [str(count_instances(y[f'{usage}_{method}'], i))
for i in range(6)]
+ [str(len(y[f'{usage}_{method}']))])


def create_regression_datasets():
def create_regression_datasets(taking_logarithms=True):
for usage in ['train', 'test']:
this_dataset_filename = find_dataset_filename(usage,
method='augmented')
# we will use the augmented dataset here
with open(this_dataset_filename, 'rb') as this_dataset_file:
X, Y, T = pickle.load(this_dataset_file)
Y = [log(timings[0]) for timings in T] # remove log here if real times want to be given
if taking_logarithms:
Y = [log(timings[0]) for timings in T]
else:
Y = [timings[0] for timings in T]
this_dataset_filename =\
find_dataset_filename(usage, method='regression')
with open(this_dataset_filename, 'wb') as this_dataset_file:
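The eight-way unpacking above relies on scikit-learn's train_test_split accepting any number of equal-length arrays and returning a train/test pair for each, in the order given. A standalone toy illustration (data made up):

import numpy as np
from sklearn.model_selection import train_test_split

features = np.arange(20).reshape(10, 2)
targets = np.arange(10)
timings = np.arange(10) * 0.5
projections = np.arange(10) + 100

(x_train, x_test,
 y_train, y_test,
 t_train, t_test,
 p_train, p_test) = train_test_split(features, targets, timings, projections,
                                     test_size=0.20, random_state=0)
print(x_train.shape, x_test.shape)  # (8, 2) (2, 2)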
7 changes: 6 additions & 1 deletion train_models.py
@@ -14,7 +14,12 @@ def train_model(ml_model, method):
train_data_filename = find_dataset_filename('train', method=method)
hyperparams_file = find_hyperparams_filename(method, ml_model)
with open(train_data_filename, 'rb') as train_data_file:
x_train, y_train, _ = pickle.load(train_data_file)
if method == "Normal":
x_train, y_train, _, _ = pickle.load(train_data_file)
else:
x_train, y_train, _ = pickle.load(train_data_file)
# a = pickle.load(train_data_file)
# print(a[0], type(a), len(a), method)
hyperparams = read_yaml_from_file(hyperparams_file)
current_classifier = sklearn_models[ml_model]
clf = current_classifier(**hyperparams)
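train_model loads tuned hyperparameters and instantiates the chosen scikit-learn class with them. Since read_yaml_from_file and sklearn_models are repository helpers not shown here, the sketch below uses plain PyYAML and an explicit model table instead; the file path and its contents are made up.

import yaml
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Stand-in for the repository's sklearn_models mapping (assumed shape).
sklearn_models = {"RF": RandomForestClassifier, "SVC": SVC}

with open("config/hyperparams/RF_normal.yaml") as f:   # hypothetical path
    hyperparams = yaml.safe_load(f) or {}               # e.g. {"n_estimators": 200}

clf = sklearn_models["RF"](**hyperparams)
# clf.fit(x_train, y_train) would follow, as in train_model above.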
