
Commit

Adding hyperparameter tuning
delriot committed Apr 4, 2023
1 parent 9772da2 commit c37433d
Showing 21 changed files with 539 additions and 41 deletions.
Binary file not shown.
106 changes: 106 additions & 0 deletions choose_hyperparams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""
The experiments in [1] are replicated with some changes.

The first change is that the testing data is balanced, so that all targets
are almost equally common.
We then use three training sets: the dataset as in [1], a balanced dataset,
and a data-augmented dataset.

[1] Florescu, D., England, M. (2020). A Machine Learning Based Software Pipeline
to Pick the Variable Ordering for Algorithms with Polynomial Inputs.
Bigatti, A., Carette, J., Davenport, J., Joswig, M., de Wolff, T. (eds)
Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science,
vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30
"""


import os
import pickle
import csv
import yaml
from config.ml_models import ml_models
from config.ml_models import classifiers
from config.ml_models import dataset_types
from config.hyperparameters_grid import grid
from sklearn.model_selection import GridSearchCV


def write_yaml_to_file(py_obj, filename):
    with open(f'{filename}.yaml', 'w') as f:
yaml.dump(py_obj, f, sort_keys=False)
print('Written to file successfully')


def k_folds_ml(x_train, y_train, model, random_state=0):
"""
Train the desired model.

The hyperparameters of the models are chosen using 5-fold cross validation.
"""
    current_classifier = classifiers[model]
    current_grid = grid[model]
    grid_search = GridSearchCV(estimator=current_classifier(),
                               param_grid=current_grid,
                               cv=5)
    grid_search.fit(x_train, y_train)
    return grid_search.best_params_


test_balanced_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', 'test',
'balanced_test_dataset.txt')
with open(test_balanced_dataset_file, 'rb') as g:
balanced_x_test, balanced_y_test = pickle.load(g)

test_normal_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', 'test',
'normal_test_dataset.txt')
with open(test_normal_dataset_file, 'rb') as g:
normal_x_test, normal_y_test = pickle.load(g)

output_file_balanced = os.path.join(os.path.dirname(__file__),
'ml_results_k_fold_tested_in_balanced.csv')
with open(output_file_balanced, 'w') as f_balanced:
writer_balanced = csv.writer(f_balanced)
writer_balanced.writerow(["Name"] + dataset_types)
output_file_normal = os.path.join(os.path.dirname(__file__),
'ml_results_k_fold_tested_in_normal.csv')
with open(output_file_normal, 'w') as f_normal:
writer_normal = csv.writer(f_normal)
writer_normal.writerow(["Name"] + dataset_types)
for ml_model in ml_models:
print(f"Model: {ml_model}")
acc_balanced = dict()
acc_normal = dict()
for method in dataset_types:
this_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', 'train',
f'{method}_train_dataset.txt')
with open(this_dataset_file, 'rb') as f:
method_x_train, method_y_train = pickle.load(f)
hyperparams = k_folds_ml(method_x_train, method_y_train,
model=ml_model)
write_yaml_to_file(hyperparams,
os.path.join(os.path.dirname(__file__),
'config', 'hyperparams',
f'{method}_{ml_model}'))
current_classifier = classifiers[ml_model]
clf = current_classifier(**hyperparams)
clf.fit(method_x_train, method_y_train)
acc_balanced[method] = clf.score(balanced_x_test,
balanced_y_test)
acc_normal[method] = clf.score(normal_x_test, normal_y_test)
method_file = os.path.join(os.path.dirname(__file__),
'config', 'models',
f'{method}_trained_model.txt')
with open(method_file, 'wb') as f_method:
pickle.dump(clf, f_method)
    round_accuracies_balanced = [round(acc_balanced[method_here], 2)
                                 for method_here in dataset_types]
    round_accuracies_normal = [round(acc_normal[method_here], 2)
                               for method_here in dataset_types]
writer_balanced.writerow([ml_model] + round_accuracies_balanced)
writer_normal.writerow([ml_model] + round_accuracies_normal)
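The grid and classifier mapping imported at the top of choose_hyperparams.py come from config files that are not part of this diff. Below is a minimal sketch of what config/ml_models.py and config/hyperparameters_grid.py might contain so that k_folds_ml can run; every model name, class and grid value here is an illustrative assumption, not the repository's actual configuration.

# Hypothetical contents of config/ml_models.py and config/hyperparameters_grid.py
# (illustrative only; the real files are not shown in this commit).
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# config/ml_models.py: model names iterated over and the classes they map to
ml_models = ['SVC', 'DT', 'KNN', 'RF', 'MLP']
classifiers = {'SVC': SVC,
               'DT': DecisionTreeClassifier,
               'KNN': KNeighborsClassifier,
               'RF': RandomForestClassifier,
               'MLP': MLPClassifier}
dataset_types = ['normal', 'balanced', 'augmented']

# config/hyperparameters_grid.py: one parameter grid per model name
grid = {'SVC': {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},
        'DT': {'max_depth': [None, 5, 10], 'criterion': ['gini', 'entropy']},
        'KNN': {'n_neighbors': [3, 5, 9]},
        'RF': {'n_estimators': [100, 300], 'max_depth': [None, 10]},
        'MLP': {'hidden_layer_sizes': [(50,), (100,)], 'alpha': [1e-4, 1e-3]}}

With a mapping of this shape, GridSearchCV in k_folds_ml simply looks up the class and grid by model name.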
26 changes: 26 additions & 0 deletions create_clean_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
import pickle
import numpy as np
from replicating_Dorians_features import extract_features


dataset_file = os.path.join(os.path.dirname(__file__),
                            'DatasetsBeforeProcessing',
                            'dataset_without_repetition_return_ncells.txt')
with open(dataset_file, 'rb') as f:
    dataset = pickle.load(f)
original_polys_list, names, features_list, targets_list, timings_list = extract_features(dataset)

# working with raw features
features = np.array(features_list)
targets = np.array(targets_list)
timings = np.array(timings_list)
original_polys = np.array(original_polys_list)

clean_dataset_file = os.path.join(os.path.dirname(__file__),
                                  'datasets',
                                  'clean_dataset.txt')
with open(clean_dataset_file, 'wb') as g:
    pickle.dump((original_polys, names, features, targets, timings), g)
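As a quick sanity check on the file written above, the clean dataset can be reloaded and its shapes inspected; a short sketch assuming the same five-tuple layout used in the dump:

import os
import pickle

clean_dataset_file = os.path.join(os.path.dirname(__file__),
                                  'datasets', 'clean_dataset.txt')
with open(clean_dataset_file, 'rb') as f:
    original_polys, names, features, targets, timings = pickle.load(f)
# one row of 'features' per instance; 'targets' holds the optimal ordering (0-5)
print(features.shape, targets.shape, timings.shape)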
Binary file added datasets/clean_dataset.txt
Binary file not shown.
7 changes: 7 additions & 0 deletions datasets/dataset_instances.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
dataset,zero,one,two,three,four,five,total
train normal dataset,326,74,105,41,163,106,815
train balanced dataset,126,113,149,138,144,145,815
train augmented dataset,815,815,815,815,815,815,4890
test normal dataset,80,19,30,10,39,26,204
test balanced dataset,31,34,32,38,34,35,204
test augmented dataset,204,204,204,204,204,204,1224
Binary file added datasets/dataset_without_repetition.txt
Binary file not shown.
Binary file added datasets/test/augmented_test_dataset.txt
Binary file not shown.
Binary file added datasets/test/balanced_test_dataset.txt
Binary file not shown.
Binary file added datasets/test/normal_test_dataset.txt
Binary file not shown.
Binary file added datasets/train/augmented_train_dataset.txt
Binary file not shown.
Binary file added datasets/train/balanced_train_dataset.txt
Binary file not shown.
Binary file added datasets/train/normal_train_dataset.txt
Binary file not shown.
102 changes: 69 additions & 33 deletions main.py
Original file line number Diff line number Diff line change
@@ -18,6 +18,7 @@
import pickle
import random
import csv
import yaml
import importlib.util
# Check if 'dataset_manipulation' is installed
if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
@@ -31,70 +32,105 @@
from packages.dataset_manipulation import balance_dataset
from packages.dataset_manipulation import augmentate_dataset
from sklearn.preprocessing import normalize
from preprocessing_Dorians_features import normalize_features # noqa401
from sklearn.model_selection import train_test_split
from basic_ml import basic_ml
from k_folds_ml import k_folds_ml

def write_yaml_to_file(py_obj, filename):
    with open(f'{filename}.yaml', 'w') as f:
        yaml.dump(py_obj, f, sort_keys=False)
    print('Written to file successfully')



names_features_targets_file = os.path.join(os.path.dirname(__file__),
'datasets',
'names_features_targets.txt')
'clean_dataset.txt')
with open(names_features_targets_file, 'rb') as f:
names, features, targets = pickle.load(f)
augmented_features, augmented_targets = augmentate_dataset(features, targets)
original_polys, names, features, targets, timings = pickle.load(f)


augmented_features, augmented_targets, augmented_timings = augmentate_dataset(features, targets, timings)

normalized_augmented_features = normalize(augmented_features)
# an alternative approach to normalizing
# features = np.transpose(normalize_features(features))
unique_names = name_unique_features(names,
normalized_augmented_features)
augmented_features)

random_state = 0
# Other random states may be tried to check that similar results are achieved
random.seed(random_state)

# Models that will be used are chosen
ml_models = ['SVC', 'DT', 'KNN', 'RF', 'MPL', 'my_mlp']
ml_models = ['KNN', 'DT', 'MLP', 'SVC', 'RF'] # , 'my_mlp'

# train and test sets are created
x_train, x_test, y_train, y_test = train_test_split(features, targets,
x_train, x_test, y_train, y_test, t_train, t_test = train_test_split(features, targets, timings,
test_size=0.20,
random_state=random_state)
# test features are balanced
bal_x_test, bal_y_test = balance_dataset(x_test, y_test)
# and the repeated features are removed before presenting them to any model
bal_x_test, bal_y_test, bal_t_test = balance_dataset(x_test, y_test, t_test)
# and the repeated features are removed before presenting them to any ml_model
# we will ensure that instances sent to the models don't have repeated features
unique_bal_x_test = remove_notunique_features(unique_names, names, bal_x_test)
# testing data for all approaches is ready
unique_x_train = remove_notunique_features(unique_names, names, x_train)
# training data without changes ready
bal_x_train, bal_y_train = balance_dataset(x_train, y_train)
bal_x_train, bal_y_train, bal_t_train = balance_dataset(x_train, y_train, t_train)
unique_bal_x_train = remove_notunique_features(unique_names, names, bal_x_train)
# balanced training data ready
aug_x_train, aug_y_train = augmentate_dataset(x_train, y_train)
aug_x_train, aug_y_train, aug_t_train = augmentate_dataset(x_train, y_train, t_train)
unique_aug_x_train = remove_notunique_features(unique_names, names, aug_x_train)
# augmented training data ready

# output_file = os.path.join(os.path.dirname(__file__),
# 'ml_results.csv')
# with open(output_file, 'w') as f:
# writer = csv.writer(f)
# writer.writerow(["Name", "Normal", "Balance data", "Augment data"])
# for ml_model in ml_models:
# acc_basic = basic_ml(unique_x_train, unique_bal_x_test,
# y_train, bal_y_test,
# ml_model, random_state=random_state)

# acc_bal = basic_ml(unique_bal_x_train, unique_bal_x_test,
# bal_y_train, bal_y_test,
# ml_model, random_state=random_state)

# acc_augmented = basic_ml(unique_aug_x_train, unique_bal_x_test,
# aug_y_train, bal_y_test,
# ml_model, random_state=random_state)

# round_accuracies = [round(acc, 2) for acc in [acc_basic,
# acc_bal,
# acc_augmented]]
# writer.writerow([ml_model] + round_accuracies)

# output_file = os.path.join(os.path.dirname(__file__),
# 'ml_results_k_fold.csv')
# with open(output_file, 'w') as f:
# writer = csv.writer(f)
# writer.writerow(["Name", "Normal", "Balance data", "Augment data"])
# print(f"{method}")
# print(f"The accuracies of {ml_model} are:\n Normal: {acc_basic} \n Balanced: {acc_bal}\n Augmented: {acc_augmented}")

# round_accuracies = [round(acc, 2) for acc in [acc_basic,
# acc_bal,
# acc_augmented]]
# writer.writerow([ml_model] + round_accuracies)

output_file = os.path.join(os.path.dirname(__file__),
'ml_results.csv')
with open(output_file, 'w') as f:
writer = csv.writer(f)
writer.writerow(["Name", "Normal", "Balance data", "Augment data"])
for ml_model in ml_models:
acc_basic = basic_ml(unique_x_train, unique_bal_x_test,
y_train, bal_y_test,
ml_model, random_state=random_state)

acc_bal = basic_ml(unique_bal_x_train, unique_bal_x_test,
bal_y_train, bal_y_test,
ml_model, random_state=random_state)

acc_augmented = basic_ml(unique_aug_x_train, unique_bal_x_test,
aug_y_train, bal_y_test,
ml_model, random_state=random_state)

round_accuracies = [round(acc, 2) for acc in [acc_basic,
acc_bal,
acc_augmented]]
writer.writerow([ml_model] + round_accuracies)
x_and_y_per_method = dict()
x_and_y_per_method['basic'] = (unique_x_train, y_train)
x_and_y_per_method['balanced'] = (unique_bal_x_train, bal_y_train)
x_and_y_per_method['augmented'] = (unique_aug_x_train, aug_y_train)
for ml_model in ml_models:
print(f"Model: {ml_model}")
for method in ['basic', 'balanced', 'augmented']:
method_x_train, method_y_train = x_and_y_per_method[method]
hyperparams = k_folds_ml(method_x_train, method_y_train,
model=ml_model)
        write_yaml_to_file(hyperparams,
                           os.path.join(os.path.dirname(__file__),
                                        'config', 'hyperparams',
                                        f'{method}_{ml_model}'))
for train_data in ['basic', 'balanced']:
clf = ml_model()
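The new loop in main.py ends with clf = ml_model(), but ml_model is a string at that point. A hedged sketch of how the training step could continue, assuming the classifiers mapping from config.ml_models (the same mapping choose_hyperparams.py imports):

# Hypothetical continuation of the loop above: look the class up by name,
# instantiate it with the tuned hyperparameters and score it on the balanced test set.
from config.ml_models import classifiers  # assumed mapping: model name -> class

for ml_model in ml_models:
    for method in ['basic', 'balanced', 'augmented']:
        method_x_train, method_y_train = x_and_y_per_method[method]
        hyperparams = k_folds_ml(method_x_train, method_y_train, model=ml_model)
        clf = classifiers[ml_model](**hyperparams)
        clf.fit(method_x_train, method_y_train)
        print(ml_model, method, clf.score(unique_bal_x_test, bal_y_test))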
4 changes: 2 additions & 2 deletions ml_results.csv
Original file line number Diff line number Diff line change
@@ -2,6 +2,6 @@ Name,Normal,Balance data,Augment data
SVC,0.21,0.21,0.16
DT,0.32,0.27,0.34
KNN,0.29,0.39,0.51
RF,0.38,0.49,0.55
RF,0.42,0.5,0.61
MPL,0.21,0.2,0.17
my_mlp,0.28,0.32,0.36
my_mlp,0.36,0.33,0.38
4 changes: 4 additions & 0 deletions packages/build/lib/dataset_manipulation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .dataset_manipulation import augmentate_dataset # noqa401
from .dataset_manipulation import balance_dataset # noqa401
from .dataset_manipulation import name_unique_features # noqa401
from .dataset_manipulation import remove_notunique_features # noqa401
72 changes: 72 additions & 0 deletions packages/build/lib/dataset_manipulation/dataset_manipulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""Exploit symmetries in polynomials to augmentate or balance the dataset."""
import numpy as np
import math
import random
from .exploit_symmetries import give_all_symmetries

nvar = 3


def augmentate_dataset(features, targets):
"""
Multiply the size of the dataset by 6.

Arguments:
features: list(list(numpy.float))
targets: list(numpy.float)
"""
symmetric_features = []
symmetric_targets = []
for features, target in zip(features, targets):
symmetric_features += give_all_symmetries(features, int(target))
symmetric_targets += list(range(math.factorial(nvar)))
return np.array(symmetric_features), np.array(symmetric_targets)


def balance_dataset(features, targets):
"""
Balance the dataset so all targets are almost equally common.

Arguments:
features: list(list(numpy.float))
targets: list(numpy.float)
"""
balanced_features = []
balanced_targets = []
for features, target in zip(features, targets):
symmetric_features = give_all_symmetries(features, int(target))
possible_targets = list(range(math.factorial(nvar)))
new_target = random.choice(possible_targets)
balanced_features.append(symmetric_features[new_target])
balanced_targets.append(new_target)
return np.array(balanced_features), np.array(balanced_targets)


def name_unique_features(names, features):
"""
    Return the names of the unique features.

    When two features share the same value for all instances,
    one of them is not considered unique.
"""
new_features = []
new_names = []
rep = 0
for index, feature in enumerate(zip(*features)):
if (any([np.array_equal(feature, ex_feature)
for ex_feature in new_features])
or np.std(feature) == 0):
rep += 1
else:
new_features.append(feature)
new_names.append(names[index])
return new_names


def remove_notunique_features(unique_names, names, features):
"""Return the features corresponding to a name in 'unique_names'."""
unique_features = []
for index, feature in enumerate(zip(*features)):
if names[index] in unique_names:
unique_features.append(feature)
return np.transpose(unique_features)
51 changes: 51 additions & 0 deletions packages/build/lib/dataset_manipulation/exploit_symmetries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Exploit symmetries in three variable polynomials to generate up to six
instances out of each existing one.

The task at hand consists in classify this features.
We will take advantage of the fact that we can change the target by
reordering the features.

This file will contain:
- features_to_canonical_ordering: a function able to reorder the features so
that the target becomes '1', this ordering is called the canonical order.
- give_all_symmetries: a function that given the canonical order returns the
reorderings for each of the possible targets '1','2',...,'6'.
"""
from itertools import permutations

nvar = 3
variables = list(range(nvar))
perms = [list(elem) for elem in permutations(variables)]


def features_to_canonical_target(features, optimal_ordering):
"""
    Reorder the features so that the target becomes 0.

This is done by reordering the features according to the optimal variable
ordering of the set of polynomials.
"""
variable_orderings = perms[optimal_ordering]
nfeatures = len(features)
split_features = [features[int(var*nfeatures/nvar):
int((var+1)*nfeatures/nvar)]
for var in range(nvar)]
ordered_features = [split_features[variable_orderings[i]]
for i in range(nvar)]
return ordered_features


def give_all_symmetries(features, optimal_ordering):
"""Reorder the features for all possible targets."""
ordered_features = features_to_canonical_target(features,
optimal_ordering)
all_symmetries = []
for perm in perms:
new_order_features = [0]*nvar
for index, var in enumerate(perm):
new_order_features[var] = ordered_features[index]
flatten_new_order_features = [elem for lst in new_order_features
for elem in lst]
all_symmetries.append(flatten_new_order_features)
return all_symmetries
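A toy illustration of give_all_symmetries with nvar = 3: the feature vector is split into one block per variable, put into canonical order, and then permuted once per possible target. The import path and the feature values below are assumptions for illustration only.

from packages.dataset_manipulation.exploit_symmetries import give_all_symmetries  # path assumed

# Two made-up features per variable: [10, 11] for x0, [20, 21] for x1, [30, 31] for x2.
features = [10, 11, 20, 21, 30, 31]
# Suppose the optimal ordering of this instance is permutation 2, i.e. perms[2] = [1, 0, 2].
symmetries = give_all_symmetries(features, optimal_ordering=2)
for target, feats in enumerate(symmetries):
    print(target, feats)
# Six feature vectors come back, one per permutation of the variables, so a
# single labelled instance yields six (features, target) pairs; target 0 is the
# canonical order [20, 21, 10, 11, 30, 31].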
22 changes: 16 additions & 6 deletions packages/dataset_manipulation/dataset_manipulation.py
Original file line number Diff line number Diff line change
@@ -7,7 +7,7 @@
nvar = 3


def augmentate_dataset(features, targets):
def augmentate_dataset(features, targets, timings):
"""
Multiply the size of the dataset by 6.

@@ -17,13 +17,15 @@ def augmentate_dataset(features, targets):
"""
symmetric_features = []
symmetric_targets = []
for features, target in zip(features, targets):
symmetric_timings = []
for features, target, timing in zip(features, targets, timings):
symmetric_features += give_all_symmetries(features, int(target))
symmetric_targets += list(range(math.factorial(nvar)))
return np.array(symmetric_features), np.array(symmetric_targets)
symmetric_timings += list(timing)
return np.array(symmetric_features), np.array(symmetric_targets), np.array(symmetric_timings)


def balance_dataset(features, targets):
def balance_dataset(features, targets, timings):
"""
Balance the dataset so all targets are almost equally common.

@@ -33,13 +35,15 @@ def balance_dataset(features, targets):
"""
balanced_features = []
balanced_targets = []
for features, target in zip(features, targets):
balanced_timings = []
for features, target, timing in zip(features, targets, timings):
symmetric_features = give_all_symmetries(features, int(target))
possible_targets = list(range(math.factorial(nvar)))
new_target = random.choice(possible_targets)
balanced_features.append(symmetric_features[new_target])
balanced_targets.append(new_target)
return np.array(balanced_features), np.array(balanced_targets)
balanced_timings.append(timing[new_target])
return np.array(balanced_features), np.array(balanced_targets), np.array(balanced_timings)


def name_unique_features(names, features):
@@ -57,7 +61,13 @@ def name_unique_features(names, features):
for ex_feature in new_features])
or np.std(feature) == 0):
rep += 1
elif feature.count(feature[0])==len(feature):
print(names[index])
else:
# if 'max_in_polys_max_sig'==names[index][:20]:
# print("Check ", feature.count(feature[0])==len(feature))
# print(names[index])
# print(len(feature))
new_features.append(feature)
new_names.append(names[index])
return new_names
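With the new timings argument, augmentate_dataset and balance_dataset return three arrays instead of two, and each instance is expected to carry one timing per possible ordering. A small sketch of the updated call pattern on made-up data (import path as used in main.py):

from packages.dataset_manipulation import augmentate_dataset, balance_dataset

# Two toy instances with six features each, a target in 0..5 and six timings
# (one per variable ordering).
features = [[1, 2, 3, 4, 5, 6],
            [6, 5, 4, 3, 2, 1]]
targets = [0, 3]
timings = [[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
           [1.0, 0.9, 0.8, 0.7, 0.6, 0.5]]

aug_x, aug_y, aug_t = augmentate_dataset(features, targets, timings)
print(aug_x.shape, aug_y.shape, aug_t.shape)  # (12, 6) (12,) (12,): six copies per instance
bal_x, bal_y, bal_t = balance_dataset(features, targets, timings)
print(bal_x.shape, bal_y.shape, bal_t.shape)  # (2, 6) (2,) (2,): one random symmetry each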
71 changes: 71 additions & 0 deletions replicating_Dorians_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@

import itertools
import numpy as np

nvar = 3


def aveg(given_list):
    """Return the average of the elements in the list."""
    return sum(given_list)/len(given_list)


def aveg_not_zero(given_list):
    """Return the sum of the list divided by the number of nonzero elements."""
    return sum(given_list)/max(1, len([1 for elem in given_list if elem != 0]))


def identity(value):
    """Return the input unchanged; used as a no-op alternative to sign."""
    return value


def sign(value):
    """Return the sign of a number, or the elementwise signs of a list."""
    if type(value) == list:
        return [sign(elem) for elem in value]
    if value > 0:
        return 1
    elif value < 0:
        return -1
    elif value == 0:
        return 0
    raise Exception("How is this possible?")


def create_features(degrees, variable=0, sv=False):
    """Build aggregated degree features (and their names) for one variable."""
    functions = [sum, max, aveg, aveg_not_zero]
    sign_or_not = [identity, sign]
    features = []
    features_names = []
    for choice in itertools.product(functions, sign_or_not,
                                    functions, sign_or_not):
        feature_description = (choice[0].__name__
                               + "sign"*(choice[1].__name__ == "sign")
                               + "_in_polys_" + choice[2].__name__ + "_"
                               + "sign"*(choice[3].__name__ == "sign")
                               + "of_" + "sum_of_"*sv
                               + "degrees_of_var_" + str(variable)
                               + "_in_monomials")
        feature_value = choice[0](choice[1]([choice[2](choice[3](degrees_in_poly))
                                             for degrees_in_poly in degrees]))
        features.append(feature_value)
        features_names.append(feature_description)
    return features, features_names


def extract_features(dataset):
all_features = []
all_targets = []
all_timings = []
all_original_polynomials = []
for index, all_projections in enumerate(dataset[0]):
original_polynomials = all_projections[0][0]
all_original_polynomials.append(original_polynomials)
names = []
instance_features = []
all_targets.append(dataset[1][index])
all_timings.append(dataset[2][index])
for var in range(nvar):
degrees = [[monomial[var] for monomial in poly]
for poly in original_polynomials]
var_features, var_features_names = create_features(degrees,
variable=var)
instance_features += var_features
names += var_features_names
            sdegrees = [[sum(monomial) for monomial in poly
                         if monomial[var] != 0] + [0]
                        for poly in original_polynomials]
            svar_features, svar_features_names = create_features(sdegrees,
                                                                 variable=var,
                                                                 sv=True)
instance_features += svar_features
names += svar_features_names
all_features.append(instance_features)
return np.array(all_original_polynomials), np.array(names), np.array(all_features), np.array(all_targets), np.array(all_timings)
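A small worked example of create_features: for each variable it combines four aggregation functions and two sign options at both the monomial and the polynomial level, so a single call returns 4 x 2 x 4 x 2 = 64 values with matching names. The degree matrix below is made up.

from replicating_Dorians_features import create_features

# Degrees of variable x0 in each monomial of two toy polynomials:
# poly 1 has monomials with x0-degrees 2, 0, 1; poly 2 has degrees 0, 3.
degrees = [[2, 0, 1],
           [0, 3]]
values, names = create_features(degrees, variable=0)
print(len(values), len(names))  # 64 and 64
# The first combination is (sum, identity, sum, identity):
# names[0] == 'sum_in_polys_sum_of_degrees_of_var_0_in_monomials', values[0] == 6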
100 changes: 100 additions & 0 deletions test_train_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""
The experiments in [1] are replicated with some changes.

The first change is that the testing data is balanced, so that all targets
are almost equally common.
We then use three training sets: the dataset as in [1], a balanced dataset,
and a data-augmented dataset.

[1] Florescu, D., England, M. (2020). A Machine Learning Based Software Pipeline
to Pick the Variable Ordering for Algorithms with Polynomial Inputs.
Bigatti, A., Carette, J., Davenport, J., Joswig, M., de Wolff, T. (eds)
Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science,
vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30
"""


import os
import pickle
import random
import csv
import yaml
import importlib.util
# Check if 'dataset_manipulation' is installed
if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
from dataset_manipulation import name_unique_features
from dataset_manipulation import remove_notunique_features
from dataset_manipulation import balance_dataset
from dataset_manipulation import augmentate_dataset
else:
from packages.dataset_manipulation import name_unique_features
from packages.dataset_manipulation import remove_notunique_features
from packages.dataset_manipulation import balance_dataset
from packages.dataset_manipulation import augmentate_dataset
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


def count_instances(my_dataset, instance):
    """Count how many times 'instance' appears in 'my_dataset'."""
    return sum(my_dataset == instance)


names_features_targets_file = os.path.join(os.path.dirname(__file__),
'datasets',
'clean_dataset.txt')
with open(names_features_targets_file, 'rb') as f:
original_polys, names, features, targets, timings = pickle.load(f)

augmented_features, augmented_targets, augmented_timings = augmentate_dataset(features, targets, timings)

normalized_augmented_features = normalize(augmented_features)
unique_names = name_unique_features(names,
augmented_features)

random_state = 0

x = dict() # to keep the features
y = dict() # to keep the labels
t = dict() # to keep the timings
# train and test sets are created
(not_unique_x_normal_train, not_unique_x_normal_test,
 y['train_normal'], y['test_normal'],
 t['train_normal'], t['test_normal']) = train_test_split(features, targets,
                                                          timings,
                                                          test_size=0.20,
                                                          random_state=random_state)

not_unique_balanced_x_test, y['test_balanced'], t['test_balanced'] = balance_dataset(not_unique_x_normal_test, y['test_normal'], t['test_normal'])
x['test_balanced'] = remove_notunique_features(unique_names, names, not_unique_balanced_x_test)
# testing data for all approaches is ready
# all tests will be done in balanced but the others are also computed
not_unique_augmented_x_test, y['test_augmented'], t['test_augmented'] = augmentate_dataset(not_unique_x_normal_test, y['test_normal'], t['test_normal'])
x['test_augmented'] = remove_notunique_features(unique_names, names, not_unique_augmented_x_test)
x['test_normal'] = remove_notunique_features(unique_names, names, not_unique_x_normal_test)

x['train_normal'] = remove_notunique_features(unique_names, names, not_unique_x_normal_train)
# normal training data ready
not_unique_balanced_x_train, y['train_balanced'], t['train_balanced'] = balance_dataset(not_unique_x_normal_train, y['train_normal'], t['train_normal'])
x['train_balanced'] = remove_notunique_features(unique_names, names, not_unique_balanced_x_train)
# balanced training data ready
not_unique_augmented_x_train, y['train_augmented'], t['train_augmented'] = augmentate_dataset(not_unique_x_normal_train, y['train_normal'], t['train_normal'])
x['train_augmented'] = remove_notunique_features(unique_names, names, not_unique_augmented_x_train)
# augmented training data ready


dataset_info_file = os.path.join(os.path.dirname(__file__),
'datasets',
'dataset_instances.csv')
with open(dataset_info_file, 'w') as f_dataset_info:
writer = csv.writer(f_dataset_info)
writer.writerow(['dataset'] + ['zero','one','two','three','four','five','total'])
for usage in ['train', 'test']:
for method in ['normal', 'balanced', 'augmented']:
print(f"y['{usage}_{method}'])", len(y[f'{usage}_{method}']))
this_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', usage,
f'{method}_{usage}_dataset.txt')
with open(this_dataset_file, 'wb') as f:
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}']), f)

writer.writerow([f'{usage} {method} dataset']
+ [str(count_instances(y[f'{usage}_{method}'], i))
for i in range(6)]
+ [str(len(y[f'{usage}_{method}']))])
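The per-class counts written to dataset_instances.csv come from count_instances above; a minimal, self-contained illustration on a toy label array:

import numpy as np


def count_instances(my_dataset, instance):
    return sum(my_dataset == instance)


y_toy = np.array([0, 3, 3, 5, 1, 3])
print([count_instances(y_toy, i) for i in range(6)])  # [1, 1, 0, 3, 0, 1]
print(len(y_toy))                                     # the 'total' column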
15 changes: 15 additions & 0 deletions train_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import yaml
from yaml import UnsafeLoader
import os
from config.ml_models import ml_models
from config.ml_models import dataset_types

print(ml_models)
for ml_model in ml_models:
for method in dataset_types:
filename = os.path.join(os.path.dirname(__file__),
'config', 'hyperparams',
f'{method}_{ml_model}.yaml')
with open(filename, 'r') as f:
hyperparameters = yaml.load(f, Loader=UnsafeLoader)
print(type(hyperparameters), hyperparameters)
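train_models.py currently only prints the loaded hyperparameters. A hedged sketch of a natural next step, refitting each classifier with its tuned parameters, assuming the classifiers mapping from config.ml_models and the training pickles written by test_train_datasets.py:

# Hypothetical extension of the loop above.
import pickle
from config.ml_models import classifiers  # assumed mapping: model name -> class

for ml_model in ml_models:
    for method in dataset_types:
        filename = os.path.join(os.path.dirname(__file__),
                                'config', 'hyperparams',
                                f'{method}_{ml_model}.yaml')
        with open(filename, 'r') as f:
            hyperparameters = yaml.load(f, Loader=UnsafeLoader)
        train_file = os.path.join(os.path.dirname(__file__),
                                  'datasets', 'train',
                                  f'{method}_train_dataset.txt')
        with open(train_file, 'rb') as g:
            x_train, y_train = pickle.load(g)
        clf = classifiers[ml_model](**hyperparameters)
        clf.fit(x_train, y_train)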
