Commit

Now the main function is very clear and runs everything
delriot committed Apr 6, 2023
1 parent 3300703 commit f83a2ce
Showing 12 changed files with 71 additions and 173 deletions.
17 changes: 15 additions & 2 deletions choose_hyperparams.py
@@ -17,13 +17,15 @@
import os
import pickle
import csv
import importlib.util
from config.ml_models import ml_models
from config.ml_models import classifiers
from config.ml_models import dataset_types
from config.hyperparameters_grid import grid
from sklearn.model_selection import GridSearchCV
from yaml_tools import read_yaml_from_file
from yaml_tools import write_yaml_to_file
from find_filename import find_dataset_filename
from find_filename import find_hyperparams_filename


def k_folds_ml(x_train, y_train, model, random_state=0):
"""
@@ -40,6 +42,17 @@ def k_folds_ml(x_train, y_train, model, random_state=0):
return rf_cv.best_params_


def choose_hyperparams(ml_model, method):
"""Given a ml_model and a method, a file with the hyperparameters
chosen by cross validation is created"""
this_dataset_file = find_dataset_filename('train', method=method)
with open(this_dataset_file, 'rb') as f:
method_x_train, method_y_train = pickle.load(f)
hyperparams = k_folds_ml(method_x_train, method_y_train, model=ml_model)
hyperparams_filename = find_hyperparams_filename(method, ml_model)
write_yaml_to_file(hyperparams, hyperparams_filename)


test_balanced_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', 'test',
'balanced_test_dataset.txt')
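
The yaml_tools helpers imported above are not shown in this commit. A minimal sketch of what they might contain: write_yaml_to_file follows the inlined version removed from main.py later in this diff, and read_yaml_from_file is an assumed yaml.safe_load counterpart.

# yaml_tools.py (sketch; not part of this diff)
import yaml

def write_yaml_to_file(py_obj, filename):
    # Same behaviour as the helper removed from main.py in this commit.
    with open(f'{filename}.yaml', 'w') as f:
        yaml.dump(py_obj, f, sort_keys=False)
    print('Written to file successfully')

def read_yaml_from_file(filename):
    # Assumed counterpart: load the saved hyperparameters back as a dict.
    with open(f'{filename}.yaml', 'r') as f:
        return yaml.safe_load(f)
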
29 changes: 12 additions & 17 deletions create_clean_dataset.py
@@ -1,25 +1,11 @@
import pickle
import numpy as np
from replicating_Dorians_features import extract_features
import sys
import os
import importlib
if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
from dataset_manipulation import name_unique_features
from dataset_manipulation import remove_notunique_features
from dataset_manipulation import balance_dataset
from dataset_manipulation import augmentate_dataset
else:
from packages.dataset_manipulation import name_unique_features
from packages.dataset_manipulation import remove_notunique_features
from packages.dataset_manipulation import balance_dataset
from packages.dataset_manipulation import augmentate_dataset


dataset_filename = os.path.join(os.path.dirname(__file__), 'DatasetsBeforeProcessing', 'dataset_without_repetition_return_ncells.txt')
clean_dataset_filename = os.path.join(os.path.dirname(__file__),
'datasets',
'clean_dataset.txt')


def cleaning_dataset(dataset_filename, clean_dataset_filename):
@@ -35,7 +21,16 @@ def cleaning_dataset(dataset_filename, clean_dataset_filename):
timings = np.array(timings_list)
original_polys = np.array(original_polys_list)

with open(clean_dataset_filename, 'wb') as g:
dataset = pickle.dump((original_polys, unique_names, unique_features, targets, timings), g)
with open(clean_dataset_filename, 'wb') as clean_dataset_file:
dataset = pickle.dump((original_polys, unique_names,
unique_features, targets, timings),
clean_dataset_file)


cleaning_dataset(dataset_filename, clean_dataset_filename)
# dataset_filename = os.path.join(os.path.dirname(__file__),
# 'DatasetsBeforeProcessing',
# 'dataset_without_repetition_return_ncells.txt')
# clean_dataset_filename = os.path.join(os.path.dirname(__file__),
# 'datasets',
# 'clean_dataset.txt')
# cleaning_dataset(dataset_filename, clean_dataset_filename)
4 changes: 2 additions & 2 deletions datasets/dataset_instances.csv
@@ -1,7 +1,7 @@
dataset,zero,one,two,three,four,five,total
train normal dataset,326,74,105,41,163,106,815
train balanced dataset,151,121,136,152,133,122,815
train balanced dataset,130,120,135,143,135,152,815
train augmented dataset,815,815,815,815,815,815,4890
test normal dataset,80,19,30,10,39,26,204
test balanced dataset,29,27,32,48,34,34,204
test balanced dataset,34,31,32,37,39,31,204
test augmented dataset,204,204,204,204,204,204,1224
Binary file modified datasets/test/augmented_test_dataset.txt
Binary file modified datasets/test/balanced_test_dataset.txt
Binary file modified datasets/test/normal_test_dataset.txt
Binary file modified datasets/train/augmented_train_dataset.txt
Binary file modified datasets/train/balanced_train_dataset.txt
Binary file modified datasets/train/normal_train_dataset.txt
137 changes: 19 additions & 118 deletions main.py
@@ -12,125 +12,26 @@
Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science,
vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30
"""
from config.ml_models import ml_models
from config.ml_models import dataset_types
from find_filename import find_dataset_filename
from create_clean_dataset import cleaning_dataset
from test_train_datasets import create_train_test_datasets
from choose_hyperparams import choose_hyperparams
from train_models import train_model
from test_models import test_results


import os
import pickle
import random
import csv
import yaml
import importlib.util
# Check if 'dataset_manipulation' is installed
if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
from dataset_manipulation import name_unique_features
from dataset_manipulation import remove_notunique_features
from dataset_manipulation import balance_dataset
from dataset_manipulation import augmentate_dataset
else:
from packages.dataset_manipulation import name_unique_features
from packages.dataset_manipulation import remove_notunique_features
from packages.dataset_manipulation import balance_dataset
from packages.dataset_manipulation import augmentate_dataset
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from basic_ml import basic_ml
from k_folds_ml import k_folds_ml
original_dataset_file = find_dataset_filename('unclean')
clean_dataset_filename = find_dataset_filename('clean')
cleaning_dataset(original_dataset_file, clean_dataset_filename)
create_train_test_datasets()

def write_yaml_to_file(py_obj,filename):
with open(f'{filename}.yaml', 'w',) as f :
yaml.dump(py_obj,f,sort_keys=False)
print('Written to file successfully')



names_features_targets_file = os.path.join(os.path.dirname(__file__),
'datasets',
'clean_dataset.txt')
with open(names_features_targets_file, 'rb') as f:
original_polys, names, features, targets, timings = pickle.load(f)


augmented_features, augmented_targets, augmented_timings = augmentate_dataset(features, targets, timings)

normalized_augmented_features = normalize(augmented_features)
# an alternative approach to normalizing
# features = np.transpose(normalize_features(features))
unique_names = name_unique_features(names,
augmented_features)

random_state = 0
# Other random states may be tried to check that similar results are achieved
random.seed(random_state)

# Models that will be used are chosen
ml_models = ['KNN', 'DT', 'MLP', 'SVC', 'RF'] # , 'my_mlp'

# train and test sets are created
x_train, x_test, y_train, y_test, t_train, t_test = train_test_split(features, targets, timings,
test_size=0.20,
random_state=random_state)
# test features are balanced
bal_x_test, bal_y_test, bal_t_test = balance_dataset(x_test, y_test, t_test)
# and the repeated features are removed before presenting them to any ml_model
# we will ensure that instances sent to the models don't have repeated features
unique_bal_x_test = remove_notunique_features(unique_names, names, bal_x_test)
# testing data for all approaches is ready
unique_x_train = remove_notunique_features(unique_names, names, x_train)
# training data without changes ready
bal_x_train, bal_y_train, bal_t_train = balance_dataset(x_train, y_train, t_train)
unique_bal_x_train = remove_notunique_features(unique_names, names, bal_x_train)
# balanced training data ready
aug_x_train, aug_y_train, aug_t_train = augmentate_dataset(x_train, y_train, t_train)
unique_aug_x_train = remove_notunique_features(unique_names, names, aug_x_train)
# augmented training data ready

# output_file = os.path.join(os.path.dirname(__file__),
# 'ml_results.csv')
# with open(output_file, 'w') as f:
# writer = csv.writer(f)
# writer.writerow(["Name", "Normal", "Balance data", "Augment data"])
# for ml_model in ml_models:
# acc_basic = basic_ml(unique_x_train, unique_bal_x_test,
# y_train, bal_y_test,
# ml_model, random_state=random_state)

# acc_bal = basic_ml(unique_bal_x_train, unique_bal_x_test,
# bal_y_train, bal_y_test,
# ml_model, random_state=random_state)

# acc_augmented = basic_ml(unique_aug_x_train, unique_bal_x_test,
# aug_y_train, bal_y_test,
# ml_model, random_state=random_state)

# round_accuracies = [round(acc, 2) for acc in [acc_basic,
# acc_bal,
# acc_augmented]]
# writer.writerow([ml_model] + round_accuracies)

# output_file = os.path.join(os.path.dirname(__file__),
# 'ml_results_k_fold.csv')
# with open(output_file, 'w') as f:
# writer = csv.writer(f)
# writer.writerow(["Name", "Normal", "Balance data", "Augment data"])
# print(f"{method}")
# print(f"The accuracies of {ml_model} are:\n Normal: {acc_basic} \n Balanced: {acc_bal}\n Augmented: {acc_augmented}")

# round_accuracies = [round(acc, 2) for acc in [acc_basic,
# acc_bal,
# acc_augmented]]
# writer.writerow([ml_model] + round_accuracies)

x_and_y_per_method = dict()
x_and_y_per_method['basic'] = (unique_x_train, y_train)
x_and_y_per_method['balanced'] = (unique_bal_x_train, bal_y_train)
x_and_y_per_method['augmented'] = (unique_aug_x_train, aug_y_train)
for ml_model in ml_models:
print(f"Model: {ml_model}")
for method in ['basic', 'balanced', 'augmented']:
method_x_train, method_y_train = x_and_y_per_method[method]
hyperparams = k_folds_ml(method_x_train, method_y_train,
model=ml_model)
write_yaml_to_file(hyperparams,
f'UsingDorianFeatures\\config\\hyperparams\\{method}_{ml_model}')
for train_data in ['basic', 'balanced']:
clf = ml_model()
for method in dataset_types:
choose_hyperparams(ml_model, method)
for ml_model in ml_models:
for method in dataset_types:
train_model(ml_model, method)
for testing_method in ['normal', 'balanced']:
test_results(testing_method)
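
Because the rendering above interleaves removed and added lines without +/- markers, the resulting top-level flow of main.py after this commit is reassembled below as a sketch. The contents of config.ml_models are not part of this diff; the example values in the comments are taken from the list removed above (ml_models) or are assumptions (dataset_types).

# main.py after this commit (reassembled sketch)
from config.ml_models import ml_models      # e.g. ['KNN', 'DT', 'MLP', 'SVC', 'RF'], per the removed list above
from config.ml_models import dataset_types  # assumed to name the dataset variants, e.g. ['normal', 'balanced', 'augmented']
from find_filename import find_dataset_filename
from create_clean_dataset import cleaning_dataset
from test_train_datasets import create_train_test_datasets
from choose_hyperparams import choose_hyperparams
from train_models import train_model
from test_models import test_results

original_dataset_file = find_dataset_filename('unclean')
clean_dataset_filename = find_dataset_filename('clean')
cleaning_dataset(original_dataset_file, clean_dataset_filename)
create_train_test_datasets()
for ml_model in ml_models:
    for method in dataset_types:
        choose_hyperparams(ml_model, method)
for ml_model in ml_models:
    for method in dataset_types:
        train_model(ml_model, method)
for testing_method in ['normal', 'balanced']:
    test_results(testing_method)
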
27 changes: 9 additions & 18 deletions test_train_datasets.py
@@ -16,57 +16,51 @@

import os
import pickle
import random
import csv
import yaml
import importlib.util
# Check if 'dataset_manipulation' is installed
if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
from dataset_manipulation import name_unique_features
from dataset_manipulation import remove_notunique_features
from dataset_manipulation import balance_dataset
from dataset_manipulation import augmentate_dataset
else:
from packages.dataset_manipulation import name_unique_features
from packages.dataset_manipulation import remove_notunique_features
from packages.dataset_manipulation import balance_dataset
from packages.dataset_manipulation import augmentate_dataset
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from find_filename import find_dataset_filename


def count_instances(my_dataset, instance):
return sum(my_dataset == instance)


def create_train_test_datasets(clean_dataset_filename):
def create_train_test_datasets():
clean_dataset_filename = find_dataset_filename('clean')
with open(clean_dataset_filename, 'rb') as clean_dataset_file:
_, names, features, targets, timings = pickle.load(clean_dataset_file)
unique_names, unique_features = remove_notunique_features(names, features)

x = dict() # to keep the features
y = dict() # to keep the labels
t = dict() # to keep the timings
# train and test sets are created
random_state = 0
x['train_normal'], x['test_normal'], y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(unique_features, targets, timings,
test_size=0.20,
random_state=random_state)
for purpose in ['train', 'test']:
x[f'{purpose}_balanced'], y[f'{purpose}_balanced'], t[f'{purpose}_balanced'] = balance_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])


dataset_info_file = os.path.join(os.path.dirname(__file__),
'datasets',
'dataset_instances.csv')
dataset_info_file = find_dataset_filename('instances')
with open(dataset_info_file, 'w') as f_dataset_info:
writer = csv.writer(f_dataset_info)
writer.writerow(['dataset'] + ['zero', 'one', 'two', 'three', 'four', 'five', 'total'])
for usage in ['train', 'test']:
for method in ['normal', 'balanced', 'augmented']:
print(f"y['{usage}_{method}'])", len(y[f'{usage}_{method}']))
this_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', usage,
f'{method}_{usage}_dataset.txt')
'datasets', usage,
f'{method}_{usage}_dataset.txt')
with open(this_dataset_file, 'wb') as f:
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}']), f)

@@ -76,7 +70,4 @@ def create_train_test_datasets(clean_dataset_filename):
+ [str(len(y[f'{usage}_{method}']))])


# clean_dataset_filename = os.path.join(os.path.dirname(__file__),
# 'datasets',
# 'clean_dataset.txt')
# create_train_test_datasets(clean_dataset_filename)
# create_train_test_datasets()
30 changes: 14 additions & 16 deletions train_models.py
@@ -1,22 +1,20 @@
import os
import pickle
from yaml_tools import read_yaml_from_file
from config.ml_models import classifiers
from find_filename import find_dataset_filename
from find_filename import find_hyperparams_filename
from find_filename import find_model_filename


def train_model(ml_model, method):
train_data_file = os.path.join(os.path.dirname(__file__),
'datasets', 'train',
f'{method}_train_dataset.txt')
hyperparams_file = os.path.join(os.path.dirname(__file__),
'config', 'hyperparams',
f'{method}_{ml_model}')
with open(train_data_file, 'rb') as f:
method_x_train, method_y_train = pickle.load(f)
hyperparams = read_yaml_from_file(hyperparams_file)
current_classifier = classifiers[ml_model]
clf = current_classifier(**hyperparams)
clf.fit(method_x_train, method_y_train)


# print(train_model(ml_models[1], dataset_types[0]))
train_data_filename = find_dataset_filename('train', method=method)
hyperparams_file = find_hyperparams_filename(method, ml_model)
with open(train_data_filename, 'rb') as train_data_file:
x_train, y_train = pickle.load(train_data_file)
hyperparams = read_yaml_from_file(hyperparams_file)
current_classifier = classifiers[ml_model]
clf = current_classifier(**hyperparams)
clf.fit(x_train, y_train)
trained_model_filename = find_model_filename(method, ml_model)
with open(trained_model_filename, 'wb') as trained_model_file:
pickle.dump(clf, trained_model_file)
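
find_filename.py itself is not part of this commit. A plausible sketch of the helpers it must provide, inferred from the hard-coded paths that this commit removes; the exact purpose keys, the models directory, and the overall layout are assumptions.

# find_filename.py (sketch; inferred from the paths removed in this commit)
import os

BASE = os.path.dirname(__file__)

def find_dataset_filename(purpose, method=None):
    # 'unclean' -> raw dataset, 'clean' -> cleaned dataset,
    # 'train'/'test' -> per-method pickles, 'instances' -> the CSV summary.
    if purpose == 'unclean':
        return os.path.join(BASE, 'DatasetsBeforeProcessing',
                            'dataset_without_repetition_return_ncells.txt')
    elif purpose == 'clean':
        return os.path.join(BASE, 'datasets', 'clean_dataset.txt')
    elif purpose == 'instances':
        return os.path.join(BASE, 'datasets', 'dataset_instances.csv')
    return os.path.join(BASE, 'datasets', purpose,
                        f'{method}_{purpose}_dataset.txt')

def find_hyperparams_filename(method, ml_model):
    return os.path.join(BASE, 'config', 'hyperparams', f'{method}_{ml_model}')

def find_model_filename(method, ml_model):
    # Assumed location for the pickled classifiers saved by train_model.
    return os.path.join(BASE, 'models', f'{method}_{ml_model}')
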
