diff --git a/basic_ml.py b/basic_ml.py
index 941300f..728b15a 100644
--- a/basic_ml.py
+++ b/basic_ml.py
@@ -1,3 +1,6 @@
+"""NOT IN USE"""
+
+
 """Contains a function to do some basic machine learning."""
 import numpy as np
 from tensorflow import keras
diff --git a/choose_hyperparams.py b/choose_hyperparams.py
index 67a634c..6cc9d65 100644
--- a/choose_hyperparams.py
+++ b/choose_hyperparams.py
@@ -17,20 +17,13 @@
 import os
 import pickle
 import csv
-import yaml
 import importlib.util
 from config.ml_models import ml_models
 from config.ml_models import classifiers
 from config.ml_models import dataset_types
 from config.hyperparameters_grid import grid
 from sklearn.model_selection import GridSearchCV
-
-
-def write_yaml_to_file(py_obj, filename):
-    with open(f'{filename}.yaml', 'w',) as f:
-        yaml.dump(py_obj, f, sort_keys=False)
-    print('Written to file successfully')
-
+from yaml_tools import read_yaml_from_file
 
 def k_folds_ml(x_train, y_train, model, random_state=0):
     """
diff --git a/create_clean_dataset.py b/create_clean_dataset.py
index d447580..a43b707 100644
--- a/create_clean_dataset.py
+++ b/create_clean_dataset.py
@@ -1,26 +1,41 @@
 import pickle
 import numpy as np
 from replicating_Dorians_features import extract_features
-from basic_ml import use_tf, basic_ml
-from itertools import product
 import sys
 import os
-import csv
+import importlib
+if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
+    from dataset_manipulation import name_unique_features
+    from dataset_manipulation import remove_notunique_features
+    from dataset_manipulation import balance_dataset
+    from dataset_manipulation import augmentate_dataset
+else:
+    from packages.dataset_manipulation import name_unique_features
+    from packages.dataset_manipulation import remove_notunique_features
+    from packages.dataset_manipulation import balance_dataset
+    from packages.dataset_manipulation import augmentate_dataset
 
 
-dataset_file = os.path.join(os.path.dirname(__file__), 'DatasetsBeforeProcessing', 'dataset_without_repetition_return_ncells.txt')
-f = open(dataset_file, 'rb')
-dataset = pickle.load(f)
-original_polys_list, names, features_list, targets_list, timings_list = extract_features(dataset)
+dataset_filename = os.path.join(os.path.dirname(__file__), 'DatasetsBeforeProcessing', 'dataset_without_repetition_return_ncells.txt')
+clean_dataset_filename = os.path.join(os.path.dirname(__file__),
+                                      'datasets',
+                                      'clean_dataset.txt')
 
 
-# working with raw features
-features = np.array(features_list)
-targets = np.array(targets_list)
-timings = np.array(timings_list)
-original_polys = np.array(original_polys_list)
-clean_dataset_file = os.path.join(os.path.dirname(__file__),
-                                  'datasets',
-                                  'clean_dataset.txt')
-g = open(clean_dataset_file, 'wb')
-dataset = pickle.dump((original_polys, names, features, targets, timings), g)
+def cleaning_dataset(dataset_filename, clean_dataset_filename):
+    with open(dataset_filename, 'rb') as f:
+        dataset = pickle.load(f)
+    original_polys_list, names, features_list, targets_list, timings_list = extract_features(dataset)
+
+    # working with raw features
+    features = np.array(features_list)
+    unique_names, unique_features = remove_notunique_features(names, features)
+
+    targets = np.array(targets_list)
+    timings = np.array(timings_list)
+    original_polys = np.array(original_polys_list)
+
+    with open(clean_dataset_filename, 'wb') as g:
+        dataset = pickle.dump((original_polys, unique_names, unique_features, targets, timings), g)
+
+cleaning_dataset(dataset_filename, clean_dataset_filename)
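Both choose_hyperparams.py above and train_models.py below now import read_yaml_from_file from a yaml_tools module that is not included in this diff. A minimal sketch of what such a helper might contain, mirroring the removed write_yaml_to_file and the UnsafeLoader usage removed from train_models.py; the module contents here are an assumption, not part of the patch:

# yaml_tools.py -- hypothetical sketch, not part of this diff
import yaml


def write_yaml_to_file(py_obj, filename):
    # Dump a Python object to '<filename>.yaml', as the helper removed from choose_hyperparams.py did.
    with open(f'{filename}.yaml', 'w') as f:
        yaml.dump(py_obj, f, sort_keys=False)


def read_yaml_from_file(filename):
    # Load the object back; appending '.yaml' here matches train_models.py
    # passing a hyperparameter filename without an extension.
    with open(f'{filename}.yaml', 'r') as f:
        return yaml.load(f, Loader=yaml.UnsafeLoader)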
diff --git a/datasets/clean_dataset.txt b/datasets/clean_dataset.txt
index 333c663..5be0dc0 100644
Binary files a/datasets/clean_dataset.txt and b/datasets/clean_dataset.txt differ
diff --git a/datasets/dataset_instances.csv b/datasets/dataset_instances.csv
index 6d1dc65..d23ccbb 100644
--- a/datasets/dataset_instances.csv
+++ b/datasets/dataset_instances.csv
@@ -1,7 +1,7 @@
 dataset,zero,one,two,three,four,five,total
 train normal dataset,326,74,105,41,163,106,815
-train balanced dataset,126,113,149,138,144,145,815
+train balanced dataset,151,121,136,152,133,122,815
 train augmented dataset,815,815,815,815,815,815,4890
 test normal dataset,80,19,30,10,39,26,204
-test balanced dataset,31,34,32,38,34,35,204
+test balanced dataset,29,27,32,48,34,34,204
 test augmented dataset,204,204,204,204,204,204,1224
diff --git a/datasets/test/augmented_test_dataset.txt b/datasets/test/augmented_test_dataset.txt
index 5f66bb2..db064a4 100644
Binary files a/datasets/test/augmented_test_dataset.txt and b/datasets/test/augmented_test_dataset.txt differ
diff --git a/datasets/test/balanced_test_dataset.txt b/datasets/test/balanced_test_dataset.txt
index e9b91f0..1684129 100644
Binary files a/datasets/test/balanced_test_dataset.txt and b/datasets/test/balanced_test_dataset.txt differ
diff --git a/datasets/test/normal_test_dataset.txt b/datasets/test/normal_test_dataset.txt
index 8d72352..f3ae7f2 100644
Binary files a/datasets/test/normal_test_dataset.txt and b/datasets/test/normal_test_dataset.txt differ
diff --git a/datasets/train/augmented_train_dataset.txt b/datasets/train/augmented_train_dataset.txt
index 00989a1..754502e 100644
Binary files a/datasets/train/augmented_train_dataset.txt and b/datasets/train/augmented_train_dataset.txt differ
diff --git a/datasets/train/balanced_train_dataset.txt b/datasets/train/balanced_train_dataset.txt
index 731c787..6ccb005 100644
Binary files a/datasets/train/balanced_train_dataset.txt and b/datasets/train/balanced_train_dataset.txt differ
diff --git a/datasets/train/normal_train_dataset.txt b/datasets/train/normal_train_dataset.txt
index e9096f9..9b7f9af 100644
Binary files a/datasets/train/normal_train_dataset.txt and b/datasets/train/normal_train_dataset.txt differ
diff --git a/packages/dataset_manipulation/dataset_manipulation.py b/packages/dataset_manipulation/dataset_manipulation.py
index 4a5f977..cdc82d4 100644
--- a/packages/dataset_manipulation/dataset_manipulation.py
+++ b/packages/dataset_manipulation/dataset_manipulation.py
@@ -3,6 +3,7 @@
 import math
 import random
 from .exploit_symmetries import give_all_symmetries
+from sklearn.preprocessing import normalize
 
 nvar = 3
 
@@ -73,10 +74,24 @@
     return new_names
 
 
-def remove_notunique_features(unique_names, names, features):
+def get_unique_feature_names(unique_names, names, features):
     """Return the features corresponding to a name in 'unique_names'."""
     unique_features = []
     for index, feature in enumerate(zip(*features)):
         if names[index] in unique_names:
             unique_features.append(feature)
     return np.transpose(unique_features)
+
+
+def remove_notunique_features(names, features):
+    # creating some targets and timing because the function requires them
+    targets = [0]*len(features)
+    timings = [[0,0]]*len(features)
+    augmented_features, _, _ = augmentate_dataset(features, targets, timings)
+    # normalized_augmented_features = normalize(augmented_features)
+    unique_names = name_unique_features(names, augmented_features)
+    unique_features = []
+    for index, feature in enumerate(zip(*features)):
+        if names[index] in unique_names:
+            unique_features.append(feature)
+    return unique_names, np.transpose(unique_features)
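The reworked remove_notunique_features now takes only names and features and returns both the surviving names and the filtered feature matrix (the old three-argument variant survives as get_unique_feature_names). A self-contained toy sketch of the duplicate-column filtering it performs; the real function additionally augments the dataset with give_all_symmetries before comparing columns:

import numpy as np


def drop_duplicate_columns(names, features):
    # Keep the first-named representative of every distinct feature column.
    seen = {}
    for index, column in enumerate(zip(*features)):
        if column not in seen:
            seen[column] = names[index]
    kept_names = list(seen.values())
    kept_columns = [list(column) for column in seen]
    return kept_names, np.transpose(kept_columns)


names = ['a', 'b', 'c']
features = [[1, 1, 2],
            [3, 3, 4]]  # columns 'a' and 'b' are identical
print(drop_duplicate_columns(names, features))
# (['a', 'c'], array([[1, 2],
#                     [3, 4]]))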
diff --git a/test_train_datasets.py b/test_train_datasets.py
index 9a80cff..5f3ea7b 100644
--- a/test_train_datasets.py
+++ b/test_train_datasets.py
@@ -36,65 +36,47 @@
 
 
 def count_instances(my_dataset, instance):
-    return sum(my_dataset==instance)
-
-
-names_features_targets_file = os.path.join(os.path.dirname(__file__),
-                                           'datasets',
-                                           'clean_dataset.txt')
-with open(names_features_targets_file, 'rb') as f:
-    original_polys, names, features, targets, timings = pickle.load(f)
-
-augmented_features, augmented_targets, augmented_timings = augmentate_dataset(features, targets, timings)
-
-normalized_augmented_features = normalize(augmented_features)
-unique_names = name_unique_features(names,
-                                    augmented_features)
-
-random_state = 0
-
-x = dict()  # to keep the features
-y = dict()  # to keep the labels
-t = dict()  # to keep the timings
-# train and test sets are created
-not_unique_x_normal_train, not_unique_x_normal_test, y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(features, targets, timings,
-                                                                                                                                                  test_size=0.20,
-                                                                                                                                                  random_state=random_state)
-
-not_unique_balanced_x_test, y['test_balanced'], t['test_balanced'] = balance_dataset(not_unique_x_normal_test, y['test_normal'], t['test_normal'])
-x['test_balanced'] = remove_notunique_features(unique_names, names, not_unique_balanced_x_test)
-# testing data for all approaches is ready
-# all tests will be done in balanced but the others are also computed
-not_unique_augmented_x_test, y['test_augmented'], t['test_augmented'] = augmentate_dataset(not_unique_x_normal_test, y['test_normal'], t['test_normal'])
-x['test_augmented'] = remove_notunique_features(unique_names, names, not_unique_augmented_x_test)
-x['test_normal'] = remove_notunique_features(unique_names, names, not_unique_x_normal_test)
-
-x['train_normal'] = remove_notunique_features(unique_names, names, not_unique_x_normal_train)
-# normal training data ready
-not_unique_balanced_x_train, y['train_balanced'], t['train_balanced'] = balance_dataset(not_unique_x_normal_train, y['train_normal'], t['train_normal'])
-x['train_balanced'] = remove_notunique_features(unique_names, names, not_unique_balanced_x_train)
-# balanced training data ready
-not_unique_augmented_x_train, y['train_augmented'], t['train_augmented'] = augmentate_dataset(not_unique_x_normal_train, y['train_normal'], t['train_normal'])
-x['train_augmented'] = remove_notunique_features(unique_names, names, not_unique_augmented_x_train)
-# augmented training data ready
-
-
-dataset_info_file = os.path.join(os.path.dirname(__file__),
-                                 'datasets',
-                                 'dataset_instances.csv')
-with open(dataset_info_file, 'w') as f_dataset_info:
-    writer = csv.writer(f_dataset_info)
-    writer.writerow(['dataset'] + ['zero','one','two','three','four','five','total'])
-    for usage in ['train', 'test']:
-        for method in ['normal', 'balanced', 'augmented']:
-            print(f"y['{usage}_{method}'])", len(y[f'{usage}_{method}']))
-            this_dataset_file = os.path.join(os.path.dirname(__file__),
-                                             'datasets', usage,
-                                             f'{method}_{usage}_dataset.txt')
-            with open(this_dataset_file, 'wb') as f:
-                pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}']), f)
-
-            writer.writerow([f'{usage} {method} dataset']
-                            + [str(count_instances(y[f'{usage}_{method}'], i))
-                               for i in range(6)]
-                            + [str(len(y[f'{usage}_{method}']))])
\ No newline at end of file
+    return sum(my_dataset == instance)
+
+
+def create_train_test_datasets(clean_dataset_filename):
+    with open(clean_dataset_filename, 'rb') as clean_dataset_file:
+        _, names, features, targets, timings = pickle.load(clean_dataset_file)
+
+    x = dict()  # to keep the features
+    y = dict()  # to keep the labels
+    t = dict()  # to keep the timings
+    # train and test sets are created
+    x['train_normal'], x['test_normal'], y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(unique_features, targets, timings,
+                                                                                                                                      test_size=0.20,
+                                                                                                                                      random_state=random_state)
+    for purpose in ['train', 'test']:
+        x[f'{purpose}_balanced'], y[f'{purpose}_balanced'], t[f'{purpose}_balanced'] = balance_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
+        x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
+
+
+    dataset_info_file = os.path.join(os.path.dirname(__file__),
+                                     'datasets',
+                                     'dataset_instances.csv')
+    with open(dataset_info_file, 'w') as f_dataset_info:
+        writer = csv.writer(f_dataset_info)
+        writer.writerow(['dataset'] + ['zero', 'one', 'two', 'three', 'four', 'five', 'total'])
+        for usage in ['train', 'test']:
+            for method in ['normal', 'balanced', 'augmented']:
+                print(f"y['{usage}_{method}'])", len(y[f'{usage}_{method}']))
+                this_dataset_file = os.path.join(os.path.dirname(__file__),
+                                                 'datasets', usage,
+                                                 f'{method}_{usage}_dataset.txt')
+                with open(this_dataset_file, 'wb') as f:
+                    pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}']), f)
+
+                writer.writerow([f'{usage} {method} dataset']
+                                + [str(count_instances(y[f'{usage}_{method}'], i))
+                                   for i in range(6)]
+                                + [str(len(y[f'{usage}_{method}']))])
+
+
+# clean_dataset_filename = os.path.join(os.path.dirname(__file__),
+#                                       'datasets',
+#                                       'clean_dataset.txt')
+# create_train_test_datasets(clean_dataset_filename)
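Inside the new create_train_test_datasets, unique_features and random_state are free names: neither is defined in the function or at module level in this patch. Since the clean dataset now already stores deduplicated features, the split presumably operates on the unpickled features with a fixed seed, along these lines (an assumed completion, not part of the patch):

random_state = 0  # fixed seed, as in the removed module-level code
x['train_normal'], x['test_normal'], y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(
    features, targets, timings, test_size=0.20, random_state=random_state)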
diff --git a/train_models.py b/train_models.py
index abebcd1..86630ac 100644
--- a/train_models.py
+++ b/train_models.py
@@ -1,15 +1,22 @@
-import yaml
-from yaml import UnsafeLoader
 import os
-from config.ml_models import ml_models
-from config.ml_models import dataset_types
-
-print(ml_models)
-for ml_model in ml_models:
-    for method in dataset_types:
-        filename = os.path.join(os.path.dirname(__file__),
-                                'config', 'hyperparams',
-                                f'{method}_{ml_model}.yaml')
-        with open(filename, 'r') as f:
-            hyperparameters = yaml.load(f, Loader=UnsafeLoader)
-            print(type(hyperparameters), hyperparameters)
+import pickle
+from yaml_tools import read_yaml_from_file
+from config.ml_models import classifiers
+
+
+def train_model(ml_model, method):
+    train_data_file = os.path.join(os.path.dirname(__file__),
+                                   'datasets', 'train',
+                                   f'{method}_train_dataset.txt')
+    hyperparams_file = os.path.join(os.path.dirname(__file__),
+                                    'config', 'hyperparams',
+                                    f'{method}_{ml_model}')
+    with open(train_data_file, 'rb') as f:
+        method_x_train, method_y_train = pickle.load(f)
+    hyperparams = read_yaml_from_file(hyperparams_file)
+    current_classifier = classifiers[ml_model]
+    clf = current_classifier(**hyperparams)
+    clf.fit(method_x_train, method_y_train)
+
+
+# print(train_model(ml_models[1], dataset_types[0]))
\ No newline at end of file
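As rewritten, train_model fits one classifier per (model, dataset-type) pair but does not return it, so the commented-out print would only show None. A hedged sketch of a driver loop, assuming config.ml_models still exposes the ml_models and dataset_types sequences the old script iterated over:

# Hypothetical driver, mirroring the loop removed from train_models.py
from config.ml_models import ml_models, dataset_types
from train_models import train_model

for ml_model in ml_models:
    for method in dataset_types:
        train_model(ml_model, method)  # fits a classifier on datasets/train/<method>_train_dataset.txt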