diff --git a/choose_hyperparams.py b/choose_hyperparams.py
index 6cc9d65..4e3c7da 100644
--- a/choose_hyperparams.py
+++ b/choose_hyperparams.py
@@ -17,13 +17,15 @@
 import os
 import pickle
 import csv
-import importlib.util
 from config.ml_models import ml_models
 from config.ml_models import classifiers
 from config.ml_models import dataset_types
 from config.hyperparameters_grid import grid
 from sklearn.model_selection import GridSearchCV
-from yaml_tools import read_yaml_from_file
+from yaml_tools import write_yaml_to_file
+from find_filename import find_dataset_filename
+from find_filename import find_hyperparams_filename
+
 
 def k_folds_ml(x_train, y_train, model, random_state=0):
     """
@@ -40,6 +42,17 @@ def k_folds_ml(x_train, y_train, model, random_state=0):
     return rf_cv.best_params_
 
 
+def choose_hyperparams(ml_model, method):
+    """Given a ml_model and a method, a file with the hyperparameters
+    chosen by cross validation is created"""
+    this_dataset_file = find_dataset_filename('train', method=method)
+    with open(this_dataset_file, 'rb') as f:
+        method_x_train, method_y_train = pickle.load(f)
+    hyperparams = k_folds_ml(method_x_train, method_y_train, model=ml_model)
+    hyperparams_filename = find_hyperparams_filename(method, ml_model)
+    write_yaml_to_file(hyperparams, hyperparams_filename)
+
+
 test_balanced_dataset_file = os.path.join(os.path.dirname(__file__),
                                           'datasets', 'test',
                                           'balanced_test_dataset.txt')
diff --git a/create_clean_dataset.py b/create_clean_dataset.py
index a43b707..c827fa4 100644
--- a/create_clean_dataset.py
+++ b/create_clean_dataset.py
@@ -1,25 +1,11 @@
 import pickle
 import numpy as np
 from replicating_Dorians_features import extract_features
-import sys
-import os
 import importlib
 if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
-    from dataset_manipulation import name_unique_features
     from dataset_manipulation import remove_notunique_features
-    from dataset_manipulation import balance_dataset
-    from dataset_manipulation import augmentate_dataset
 else:
-    from packages.dataset_manipulation import name_unique_features
     from packages.dataset_manipulation import remove_notunique_features
-    from packages.dataset_manipulation import balance_dataset
-    from packages.dataset_manipulation import augmentate_dataset
-
-
-dataset_filename = os.path.join(os.path.dirname(__file__), 'DatasetsBeforeProcessing', 'dataset_without_repetition_return_ncells.txt')
-clean_dataset_filename = os.path.join(os.path.dirname(__file__),
-                                      'datasets',
-                                      'clean_dataset.txt')
 
 
 def cleaning_dataset(dataset_filename, clean_dataset_filename):
@@ -35,7 +21,16 @@
     timings = np.array(timings_list)
     original_polys = np.array(original_polys_list)
 
-    with open(clean_dataset_filename, 'wb') as g:
-        dataset = pickle.dump((original_polys, unique_names, unique_features, targets, timings), g)
+    with open(clean_dataset_filename, 'wb') as clean_dataset_file:
+        dataset = pickle.dump((original_polys, unique_names,
+                               unique_features, targets, timings),
+                              clean_dataset_file)
+
 
-cleaning_dataset(dataset_filename, clean_dataset_filename)
+# dataset_filename = os.path.join(os.path.dirname(__file__),
+#                                 'DatasetsBeforeProcessing',
+#                                 'dataset_without_repetition_return_ncells.txt')
+# clean_dataset_filename = os.path.join(os.path.dirname(__file__),
+#                                       'datasets',
+#                                       'clean_dataset.txt')
+# cleaning_dataset(dataset_filename, clean_dataset_filename)
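
Note: choose_hyperparams.py, train_models.py and main.py all import ml_models, classifiers and dataset_types from config.ml_models, which is not part of this patch. The sketch below is an assumption of what that module contains: the model names match the list removed from the old main.py further down, while the classifier classes and the dataset_types values are guesses inferred from how they are used.

# config/ml_models.py (assumed contents, not included in this patch)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# model names taken from the list removed from the old main.py
ml_models = ['KNN', 'DT', 'MLP', 'SVC', 'RF']

# used as classifiers[ml_model](**hyperparams) in train_models.py,
# so assumed to map each name to an sklearn estimator class
classifiers = {
    'KNN': KNeighborsClassifier,
    'DT': DecisionTreeClassifier,
    'MLP': MLPClassifier,
    'SVC': SVC,
    'RF': RandomForestClassifier,
}

# dataset variants written by test_train_datasets.py (assumed)
dataset_types = ['normal', 'balanced', 'augmented']
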
diff --git a/datasets/dataset_instances.csv b/datasets/dataset_instances.csv
index d23ccbb..a4eeeff 100644
--- a/datasets/dataset_instances.csv
+++ b/datasets/dataset_instances.csv
@@ -1,7 +1,7 @@
 dataset,zero,one,two,three,four,five,total
 train normal dataset,326,74,105,41,163,106,815
-train balanced dataset,151,121,136,152,133,122,815
+train balanced dataset,130,120,135,143,135,152,815
 train augmented dataset,815,815,815,815,815,815,4890
 test normal dataset,80,19,30,10,39,26,204
-test balanced dataset,29,27,32,48,34,34,204
+test balanced dataset,34,31,32,37,39,31,204
 test augmented dataset,204,204,204,204,204,204,1224
diff --git a/datasets/test/augmented_test_dataset.txt b/datasets/test/augmented_test_dataset.txt
index db064a4..58e811f 100644
Binary files a/datasets/test/augmented_test_dataset.txt and b/datasets/test/augmented_test_dataset.txt differ
diff --git a/datasets/test/balanced_test_dataset.txt b/datasets/test/balanced_test_dataset.txt
index 1684129..c0686fb 100644
Binary files a/datasets/test/balanced_test_dataset.txt and b/datasets/test/balanced_test_dataset.txt differ
diff --git a/datasets/test/normal_test_dataset.txt b/datasets/test/normal_test_dataset.txt
index f3ae7f2..90a466f 100644
Binary files a/datasets/test/normal_test_dataset.txt and b/datasets/test/normal_test_dataset.txt differ
diff --git a/datasets/train/augmented_train_dataset.txt b/datasets/train/augmented_train_dataset.txt
index 754502e..c40cd59 100644
Binary files a/datasets/train/augmented_train_dataset.txt and b/datasets/train/augmented_train_dataset.txt differ
diff --git a/datasets/train/balanced_train_dataset.txt b/datasets/train/balanced_train_dataset.txt
index 6ccb005..34573c3 100644
Binary files a/datasets/train/balanced_train_dataset.txt and b/datasets/train/balanced_train_dataset.txt differ
diff --git a/datasets/train/normal_train_dataset.txt b/datasets/train/normal_train_dataset.txt
index 9b7f9af..4ef767e 100644
Binary files a/datasets/train/normal_train_dataset.txt and b/datasets/train/normal_train_dataset.txt differ
diff --git a/main.py b/main.py
index 71bae95..1a2eca1 100644
--- a/main.py
+++ b/main.py
@@ -12,125 +12,26 @@
 Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science, vol 12097. Springer, Cham.
 https://doi.org/10.1007/978-3-030-52200-1_30
 """
+from config.ml_models import ml_models
+from config.ml_models import dataset_types
+from find_filename import find_dataset_filename
+from create_clean_dataset import cleaning_dataset
+from test_train_datasets import create_train_test_datasets
+from choose_hyperparams import choose_hyperparams
+from train_models import train_model
+from test_models import test_results
 
-import os
-import pickle
-import random
-import csv
-import yaml
-import importlib.util
-# Check if 'dataset_manipulation' is installed
-if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
-    from dataset_manipulation import name_unique_features
-    from dataset_manipulation import remove_notunique_features
-    from dataset_manipulation import balance_dataset
-    from dataset_manipulation import augmentate_dataset
-else:
-    from packages.dataset_manipulation import name_unique_features
-    from packages.dataset_manipulation import remove_notunique_features
-    from packages.dataset_manipulation import balance_dataset
-    from packages.dataset_manipulation import augmentate_dataset
-from sklearn.preprocessing import normalize
-from sklearn.model_selection import train_test_split
-from basic_ml import basic_ml
-from k_folds_ml import k_folds_ml
+original_dataset_file = find_dataset_filename('unclean')
+clean_dataset_filename = find_dataset_filename('clean')
+cleaning_dataset(original_dataset_file, clean_dataset_filename)
+create_train_test_datasets()
 
 
-def write_yaml_to_file(py_obj,filename):
-    with open(f'{filename}.yaml', 'w',) as f :
-        yaml.dump(py_obj,f,sort_keys=False)
-        print('Written to file successfully')
-
-
-
-names_features_targets_file = os.path.join(os.path.dirname(__file__),
-                                           'datasets',
-                                           'clean_dataset.txt')
-with open(names_features_targets_file, 'rb') as f:
-    original_polys, names, features, targets, timings = pickle.load(f)
-
-augmented_features, augmented_targets, augmented_timings = augmentate_dataset(features, targets, timings)
-
-normalized_augmented_features = normalize(augmented_features)
-# an alternative approach to normalizing
-# features = np.transpose(normalize_features(features))
-unique_names = name_unique_features(names,
-                                    augmented_features)
-
-random_state = 0
-# Other random states may be tried to check that similar results are achieved
-random.seed(random_state)
-
-# Models that will be used are chosen
-ml_models = ['KNN', 'DT', 'MLP', 'SVC', 'RF'] # , 'my_mlp'
-
-# train and test sets are created
-x_train, x_test, y_train, y_test, t_train, t_test = train_test_split(features, targets, timings,
-                                                                     test_size=0.20,
-                                                                     random_state=random_state)
-# test features are balanced
-bal_x_test, bal_y_test, bal_t_test = balance_dataset(x_test, y_test, t_test)
-# and the repeated features are removed before presenting them to any ml_model
-# we will ensure that instances send to the models dont have repeated features
-unique_bal_x_test = remove_notunique_features(unique_names, names, bal_x_test)
-# testing data for all approaches is ready
-unique_x_train = remove_notunique_features(unique_names, names, x_train)
-# training data without changes ready
-bal_x_train, bal_y_train, bal_t_train = balance_dataset(x_train, y_train, t_train)
-unique_bal_x_train = remove_notunique_features(unique_names, names, bal_x_train)
-# balanced training data ready
-aug_x_train, aug_y_train, aug_t_train = augmentate_dataset(x_train, y_train, t_train)
-unique_aug_x_train = remove_notunique_features(unique_names, names, aug_x_train)
-# augmented training data ready
-
-# output_file = os.path.join(os.path.dirname(__file__),
-#                            'ml_results.csv')
-# with open(output_file, 'w') as f:
-#     writer = csv.writer(f)
-#     writer.writerow(["Name", "Normal", "Balance data", "Augment data"])
-#     for ml_model in ml_models:
-#         acc_basic = basic_ml(unique_x_train, unique_bal_x_test,
-#                              y_train, bal_y_test,
-#                              ml_model, random_state=random_state)
-
-#         acc_bal = basic_ml(unique_bal_x_train, unique_bal_x_test,
-#                            bal_y_train, bal_y_test,
-#                            ml_model, random_state=random_state)
-
-#         acc_augmented = basic_ml(unique_aug_x_train, unique_bal_x_test,
-#                                  aug_y_train, bal_y_test,
-#                                  ml_model, random_state=random_state)
-
-#         round_accuracies = [round(acc, 2) for acc in [acc_basic,
-#                                                       acc_bal,
-#                                                       acc_augmented]]
-#         writer.writerow([ml_model] + round_accuracies)
-
-# output_file = os.path.join(os.path.dirname(__file__),
-#                            'ml_results_k_fold.csv')
-# with open(output_file, 'w') as f:
-#     writer = csv.writer(f)
-#     writer.writerow(["Name", "Normal", "Balance data", "Augment data"])
-#     print(f"{method}")
-    # print(f"The accuracies of {ml_model} are:\n Normal: {acc_basic} \n Balanced: {acc_bal}\n Augmented: {acc_augmented}")
-
-    # round_accuracies = [round(acc, 2) for acc in [acc_basic,
-    #                                               acc_bal,
-    #                                               acc_augmented]]
-    # writer.writerow([ml_model] + round_accuracies)
-
-x_and_y_per_method = dict()
-x_and_y_per_method['basic'] = (unique_x_train, y_train)
-x_and_y_per_method['balanced'] = (unique_bal_x_train, bal_y_train)
-x_and_y_per_method['augmented'] = (unique_aug_x_train, aug_y_train)
 for ml_model in ml_models:
-    print(f"Model: {ml_model}")
-    for method in ['basic', 'balanced', 'augmented']:
-        method_x_train, method_y_train = x_and_y_per_method[method]
-        hyperparams = k_folds_ml(method_x_train, method_y_train,
-                                 model=ml_model)
-        write_yaml_to_file(hyperparams,
-                           f'UsingDorianFeatures\\config\\hyperparams\\{method}_{ml_model}')
-    for train_data in ['basic', 'balanced']:
-        clf = ml_model()
\ No newline at end of file
+    for method in dataset_types:
+        choose_hyperparams(ml_model, method)
+for ml_model in ml_models:
+    for method in dataset_types:
+        train_model(ml_model, method)
+for testing_method in ['normal', 'balanced']:
+    test_results(testing_method)
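
Note: the write_yaml_to_file helper deleted from main.py above now comes from yaml_tools (see the new imports in choose_hyperparams.py and train_models.py). yaml_tools.py itself is not in this patch; the following is a sketch of what it presumably contains, with read_yaml_from_file assumed to mirror the writer removed from main.py.

# yaml_tools.py (assumed contents, not included in this patch)
import yaml


def write_yaml_to_file(py_obj, filename):
    # same behaviour as the helper removed from main.py: dump a Python
    # object (here the hyperparameter dict) into <filename>.yaml
    with open(f'{filename}.yaml', 'w') as yaml_file:
        yaml.dump(py_obj, yaml_file, sort_keys=False)


def read_yaml_from_file(filename):
    # counterpart used by train_models.py; assumed to load the same file back
    with open(f'{filename}.yaml', 'r') as yaml_file:
        return yaml.safe_load(yaml_file)
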
diff --git a/test_train_datasets.py b/test_train_datasets.py
index 5f3ea7b..53d60d3 100644
--- a/test_train_datasets.py
+++ b/test_train_datasets.py
@@ -16,57 +16,51 @@
 import os
 import pickle
-import random
 import csv
-import yaml
 import importlib.util
 # Check if 'dataset_manipulation' is installed
 if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
-    from dataset_manipulation import name_unique_features
     from dataset_manipulation import remove_notunique_features
     from dataset_manipulation import balance_dataset
     from dataset_manipulation import augmentate_dataset
 else:
-    from packages.dataset_manipulation import name_unique_features
     from packages.dataset_manipulation import remove_notunique_features
     from packages.dataset_manipulation import balance_dataset
     from packages.dataset_manipulation import augmentate_dataset
-from sklearn.preprocessing import normalize
 from sklearn.model_selection import train_test_split
+from find_filename import find_dataset_filename
 
 
 def count_instances(my_dataset, instance):
     return sum(my_dataset == instance)
 
 
-def create_train_test_datasets(clean_dataset_filename):
+def create_train_test_datasets():
+    clean_dataset_filename = find_dataset_filename('clean')
     with open(clean_dataset_filename, 'rb') as clean_dataset_file:
         _, names, features, targets, timings = pickle.load(clean_dataset_file)
+
     unique_names, unique_features = remove_notunique_features(names, features)
     x = dict() # to keep the features
     y = dict() # to keep the labels
     t = dict() # to keep the timings
     # train and test sets are created
+    random_state = 0
     x['train_normal'], x['test_normal'], y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(unique_features, targets, timings, test_size=0.20, random_state=random_state)
     for purpose in ['train', 'test']:
         x[f'{purpose}_balanced'], y[f'{purpose}_balanced'], t[f'{purpose}_balanced'] = balance_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
         x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
-
-
-    dataset_info_file = os.path.join(os.path.dirname(__file__),
-                                     'datasets',
-                                     'dataset_instances.csv')
+    dataset_info_file = find_dataset_filename('instances')
     with open(dataset_info_file, 'w') as f_dataset_info:
         writer = csv.writer(f_dataset_info)
         writer.writerow(['dataset'] + ['zero', 'one', 'two', 'three', 'four', 'five', 'total'])
         for usage in ['train', 'test']:
             for method in ['normal', 'balanced', 'augmented']:
-                print(f"y['{usage}_{method}'])", len(y[f'{usage}_{method}']))
                 this_dataset_file = os.path.join(os.path.dirname(__file__),
-                                                'datasets', usage,
-                                                f'{method}_{usage}_dataset.txt')
+                                                 'datasets', usage,
+                                                 f'{method}_{usage}_dataset.txt')
                 with open(this_dataset_file, 'wb') as f:
                     pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}']), f)
@@ -76,7 +70,4 @@ def create_train_test_datasets(clean_dataset_filename):
                                 + [str(len(y[f'{usage}_{method}']))])
 
 
-# clean_dataset_filename = os.path.join(os.path.dirname(__file__),
-#                                       'datasets',
-#                                       'clean_dataset.txt')
-# create_train_test_datasets(clean_dataset_filename)
+# create_train_test_datasets()
diff --git a/train_models.py b/train_models.py
index 86630ac..0576dcd 100644
--- a/train_models.py
+++ b/train_models.py
@@ -1,22 +1,20 @@
-import os
 import pickle
 from yaml_tools import read_yaml_from_file
 from config.ml_models import classifiers
+from find_filename import find_dataset_filename
+from find_filename import find_hyperparams_filename
+from find_filename import find_model_filename
 
 
 def train_model(ml_model, method):
-    train_data_file = os.path.join(os.path.dirname(__file__),
-                                   'datasets', 'train',
-                                   f'{method}_train_dataset.txt')
-    hyperparams_file = os.path.join(os.path.dirname(__file__),
-                                    'config', 'hyperparams',
-                                    f'{method}_{ml_model}')
-    with open(train_data_file, 'rb') as f:
-        method_x_train, method_y_train = pickle.load(f)
-    hyperparams = read_yaml_from_file(hyperparams_file)
-    current_classifier = classifiers[ml_model]
-    clf = current_classifier(**hyperparams)
-    clf.fit(method_x_train, method_y_train)
-
-
-# print(train_model(ml_models[1], dataset_types[0]))
\ No newline at end of file
+    train_data_filename = find_dataset_filename('train', method=method)
+    hyperparams_file = find_hyperparams_filename(method, ml_model)
+    with open(train_data_filename, 'rb') as train_data_file:
+        x_train, y_train = pickle.load(train_data_file)
+    hyperparams = read_yaml_from_file(hyperparams_file)
+    current_classifier = classifiers[ml_model]
+    clf = current_classifier(**hyperparams)
+    clf.fit(x_train, y_train)
+    trained_model_filename = find_model_filename(method, ml_model)
+    with open(trained_model_filename, 'wb') as trained_model_file:
+        pickle.dump(clf, trained_model_file)
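
Note: every script in this patch now resolves paths through find_filename.py, which is not included here. The sketch below is an assumption that the module simply centralises the os.path.join calls deleted above; the keys handled by find_dataset_filename follow the calls visible in the patch, and the model-file location used by find_model_filename is a pure guess.

# find_filename.py (assumed contents, not included in this patch)
import os

BASE_DIR = os.path.dirname(__file__)


def find_dataset_filename(purpose, method=None):
    # purpose is one of 'unclean', 'clean', 'instances', 'train', 'test'
    if purpose == 'unclean':
        return os.path.join(BASE_DIR, 'DatasetsBeforeProcessing',
                            'dataset_without_repetition_return_ncells.txt')
    if purpose == 'clean':
        return os.path.join(BASE_DIR, 'datasets', 'clean_dataset.txt')
    if purpose == 'instances':
        return os.path.join(BASE_DIR, 'datasets', 'dataset_instances.csv')
    # 'train' / 'test' datasets, e.g. datasets/train/balanced_train_dataset.txt
    return os.path.join(BASE_DIR, 'datasets', purpose,
                        f'{method}_{purpose}_dataset.txt')


def find_hyperparams_filename(method, ml_model):
    # matches the path previously hard-coded in train_models.py
    return os.path.join(BASE_DIR, 'config', 'hyperparams', f'{method}_{ml_model}')


def find_model_filename(method, ml_model):
    # where the pickled trained model is stored; this location is a guess
    return os.path.join(BASE_DIR, 'models', f'{method}_{ml_model}.txt')
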