Cleaning code
delriot committed Apr 4, 2023
1 parent c37433d commit 3300703
Showing 14 changed files with 119 additions and 104 deletions.
3 changes: 3 additions & 0 deletions basic_ml.py
@@ -1,3 +1,6 @@
"""NOT IN USE"""


"""Contains a function to do some basic machine learning."""
import numpy as np
from tensorflow import keras
9 changes: 1 addition & 8 deletions choose_hyperparams.py
@@ -17,20 +17,13 @@
import os
import pickle
import csv
import yaml
import importlib.util
from config.ml_models import ml_models
from config.ml_models import classifiers
from config.ml_models import dataset_types
from config.hyperparameters_grid import grid
from sklearn.model_selection import GridSearchCV


def write_yaml_to_file(py_obj, filename):
with open(f'{filename}.yaml', 'w',) as f:
yaml.dump(py_obj, f, sort_keys=False)
print('Written to file successfully')

from yaml_tools import read_yaml_from_file

def k_folds_ml(x_train, y_train, model, random_state=0):
"""
49 changes: 32 additions & 17 deletions create_clean_dataset.py
@@ -1,26 +1,41 @@
import pickle
import numpy as np
from replicating_Dorians_features import extract_features
from basic_ml import use_tf, basic_ml
from itertools import product
import sys
import os
import csv
import importlib
if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
from dataset_manipulation import name_unique_features
from dataset_manipulation import remove_notunique_features
from dataset_manipulation import balance_dataset
from dataset_manipulation import augmentate_dataset
else:
from packages.dataset_manipulation import name_unique_features
from packages.dataset_manipulation import remove_notunique_features
from packages.dataset_manipulation import balance_dataset
from packages.dataset_manipulation import augmentate_dataset


dataset_file = os.path.join(os.path.dirname(__file__), 'DatasetsBeforeProcessing', 'dataset_without_repetition_return_ncells.txt')
f = open(dataset_file, 'rb')
dataset = pickle.load(f)
original_polys_list, names, features_list, targets_list, timings_list = extract_features(dataset)
dataset_filename = os.path.join(os.path.dirname(__file__), 'DatasetsBeforeProcessing', 'dataset_without_repetition_return_ncells.txt')
clean_dataset_filename = os.path.join(os.path.dirname(__file__),
'datasets',
'clean_dataset.txt')

# working with raw features
features = np.array(features_list)
targets = np.array(targets_list)
timings = np.array(timings_list)
original_polys = np.array(original_polys_list)

clean_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets',
'clean_dataset.txt')
g = open(clean_dataset_file, 'wb')
dataset = pickle.dump((original_polys, names, features, targets, timings), g)
def cleaning_dataset(dataset_filename, clean_dataset_filename):
with open(dataset_filename, 'rb') as f:
dataset = pickle.load(f)
original_polys_list, names, features_list, targets_list, timings_list = extract_features(dataset)

# working with raw features
features = np.array(features_list)
unique_names, unique_features = remove_notunique_features(names, features)

targets = np.array(targets_list)
timings = np.array(timings_list)
original_polys = np.array(original_polys_list)

with open(clean_dataset_filename, 'wb') as g:
dataset = pickle.dump((original_polys, unique_names, unique_features, targets, timings), g)

cleaning_dataset(dataset_filename, clean_dataset_filename)
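Since cleaning_dataset is still invoked at import time, any script importing this module would regenerate the clean dataset; a small sketch of guarding that call (an assumption, not part of this commit):

# hypothetical guard: only rebuild the dataset when the script is run directly,
# not when cleaning_dataset is imported elsewhere
if __name__ == '__main__':
    cleaning_dataset(dataset_filename, clean_dataset_filename)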
Binary file modified datasets/clean_dataset.txt
4 changes: 2 additions & 2 deletions datasets/dataset_instances.csv
@@ -1,7 +1,7 @@
dataset,zero,one,two,three,four,five,total
train normal dataset,326,74,105,41,163,106,815
train balanced dataset,126,113,149,138,144,145,815
train balanced dataset,151,121,136,152,133,122,815
train augmented dataset,815,815,815,815,815,815,4890
test normal dataset,80,19,30,10,39,26,204
test balanced dataset,31,34,32,38,34,35,204
test balanced dataset,29,27,32,48,34,34,204
test augmented dataset,204,204,204,204,204,204,1224
Binary file modified datasets/test/augmented_test_dataset.txt
Binary file modified datasets/test/balanced_test_dataset.txt
Binary file modified datasets/test/normal_test_dataset.txt
Binary file modified datasets/train/augmented_train_dataset.txt
Binary file modified datasets/train/balanced_train_dataset.txt
Binary file modified datasets/train/normal_train_dataset.txt
17 changes: 16 additions & 1 deletion packages/dataset_manipulation/dataset_manipulation.py
@@ -3,6 +3,7 @@
import math
import random
from .exploit_symmetries import give_all_symmetries
from sklearn.preprocessing import normalize

nvar = 3

@@ -73,10 +74,24 @@ def name_unique_features(names, features):
return new_names


def remove_notunique_features(unique_names, names, features):
def get_unique_feature_names(unique_names, names, features):
"""Return the features corresponding to a name in 'unique_names'."""
unique_features = []
for index, feature in enumerate(zip(*features)):
if names[index] in unique_names:
unique_features.append(feature)
return np.transpose(unique_features)


def remove_notunique_features(names, features):
    # dummy targets and timings, created only because augmentate_dataset requires them
    targets = [0] * len(features)
    timings = [[0, 0]] * len(features)
augmented_features, _, _ = augmentate_dataset(features, targets, timings)
# normalized_augmented_features = normalize(augmented_features)
unique_names = name_unique_features(names, augmented_features)
unique_features = []
for index, feature in enumerate(zip(*features)):
if names[index] in unique_names:
unique_features.append(feature)
return unique_names, np.transpose(unique_features)
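The loop at the end of remove_notunique_features repeats the body of get_unique_feature_names; an equivalent refactor sketch (not part of this commit) that composes the two helpers instead:

def remove_notunique_features(names, features):
    # dummy targets and timings, only because augmentate_dataset requires them
    targets = [0] * len(features)
    timings = [[0, 0]] * len(features)
    augmented_features, _, _ = augmentate_dataset(features, targets, timings)
    unique_names = name_unique_features(names, augmented_features)
    return unique_names, get_unique_feature_names(unique_names, names, features)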
106 changes: 44 additions & 62 deletions test_train_datasets.py
@@ -36,65 +36,47 @@


def count_instances(my_dataset, instance):
return sum(my_dataset==instance)


names_features_targets_file = os.path.join(os.path.dirname(__file__),
'datasets',
'clean_dataset.txt')
with open(names_features_targets_file, 'rb') as f:
original_polys, names, features, targets, timings = pickle.load(f)

augmented_features, augmented_targets, augmented_timings = augmentate_dataset(features, targets, timings)

normalized_augmented_features = normalize(augmented_features)
unique_names = name_unique_features(names,
augmented_features)

random_state = 0

x = dict() # to keep the features
y = dict() # to keep the labels
t = dict() # to keep the timings
# train and test sets are created
not_unique_x_normal_train, not_unique_x_normal_test, y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(features, targets, timings,
test_size=0.20,
random_state=random_state)

not_unique_balanced_x_test, y['test_balanced'], t['test_balanced'] = balance_dataset(not_unique_x_normal_test, y['test_normal'], t['test_normal'])
x['test_balanced'] = remove_notunique_features(unique_names, names, not_unique_balanced_x_test)
# testing data for all approaches is ready
# all tests will be done in balanced but the others are also computed
not_unique_augmented_x_test, y['test_augmented'], t['test_augmented'] = augmentate_dataset(not_unique_x_normal_test, y['test_normal'], t['test_normal'])
x['test_augmented'] = remove_notunique_features(unique_names, names, not_unique_augmented_x_test)
x['test_normal'] = remove_notunique_features(unique_names, names, not_unique_x_normal_test)

x['train_normal'] = remove_notunique_features(unique_names, names, not_unique_x_normal_train)
# normal training data ready
not_unique_balanced_x_train, y['train_balanced'], t['train_balanced'] = balance_dataset(not_unique_x_normal_train, y['train_normal'], t['train_normal'])
x['train_balanced'] = remove_notunique_features(unique_names, names, not_unique_balanced_x_train)
# balanced training data ready
not_unique_augmented_x_train, y['train_augmented'], t['train_augmented'] = augmentate_dataset(not_unique_x_normal_train, y['train_normal'], t['train_normal'])
x['train_augmented'] = remove_notunique_features(unique_names, names, not_unique_augmented_x_train)
# augmented training data ready


dataset_info_file = os.path.join(os.path.dirname(__file__),
'datasets',
'dataset_instances.csv')
with open(dataset_info_file, 'w') as f_dataset_info:
writer = csv.writer(f_dataset_info)
writer.writerow(['dataset'] + ['zero','one','two','three','four','five','total'])
for usage in ['train', 'test']:
for method in ['normal', 'balanced', 'augmented']:
print(f"y['{usage}_{method}'])", len(y[f'{usage}_{method}']))
this_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', usage,
f'{method}_{usage}_dataset.txt')
with open(this_dataset_file, 'wb') as f:
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}']), f)

writer.writerow([f'{usage} {method} dataset']
+ [str(count_instances(y[f'{usage}_{method}'], i))
for i in range(6)]
+ [str(len(y[f'{usage}_{method}']))])
return sum(my_dataset == instance)


def create_train_test_datasets(clean_dataset_filename):
with open(clean_dataset_filename, 'rb') as clean_dataset_file:
_, names, features, targets, timings = pickle.load(clean_dataset_file)

x = dict() # to keep the features
y = dict() # to keep the labels
t = dict() # to keep the timings
# train and test sets are created
    x['train_normal'], x['test_normal'], y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(features, targets, timings,
                                                                                                                                      test_size=0.20,
                                                                                                                                      random_state=0)
for purpose in ['train', 'test']:
x[f'{purpose}_balanced'], y[f'{purpose}_balanced'], t[f'{purpose}_balanced'] = balance_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])


dataset_info_file = os.path.join(os.path.dirname(__file__),
'datasets',
'dataset_instances.csv')
with open(dataset_info_file, 'w') as f_dataset_info:
writer = csv.writer(f_dataset_info)
writer.writerow(['dataset'] + ['zero', 'one', 'two', 'three', 'four', 'five', 'total'])
for usage in ['train', 'test']:
for method in ['normal', 'balanced', 'augmented']:
print(f"y['{usage}_{method}'])", len(y[f'{usage}_{method}']))
this_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', usage,
f'{method}_{usage}_dataset.txt')
with open(this_dataset_file, 'wb') as f:
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}']), f)

writer.writerow([f'{usage} {method} dataset']
+ [str(count_instances(y[f'{usage}_{method}'], i))
for i in range(6)]
+ [str(len(y[f'{usage}_{method}']))])


# clean_dataset_filename = os.path.join(os.path.dirname(__file__),
# 'datasets',
# 'clean_dataset.txt')
# create_train_test_datasets(clean_dataset_filename)
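A quick way to sanity-check the pickled splits against datasets/dataset_instances.csv, assuming the (x, y) layout written above; this check is a hypothetical sketch, not part of the commit:

import os
import pickle
split_file = os.path.join(os.path.dirname(__file__), 'datasets', 'train',
                          'balanced_train_dataset.txt')
with open(split_file, 'rb') as f:
    x_balanced_train, y_balanced_train = pickle.load(f)
print([count_instances(y_balanced_train, i) for i in range(6)])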
35 changes: 21 additions & 14 deletions train_models.py
@@ -1,15 +1,22 @@
import yaml
from yaml import UnsafeLoader
import os
from config.ml_models import ml_models
from config.ml_models import dataset_types

print(ml_models)
for ml_model in ml_models:
for method in dataset_types:
filename = os.path.join(os.path.dirname(__file__),
'config', 'hyperparams',
f'{method}_{ml_model}.yaml')
with open(filename, 'r') as f:
hyperparameters = yaml.load(f, Loader=UnsafeLoader)
print(type(hyperparameters), hyperparameters)
import pickle
from yaml_tools import read_yaml_from_file
from config.ml_models import classifiers


def train_model(ml_model, method):
train_data_file = os.path.join(os.path.dirname(__file__),
'datasets', 'train',
f'{method}_train_dataset.txt')
hyperparams_file = os.path.join(os.path.dirname(__file__),
'config', 'hyperparams',
f'{method}_{ml_model}')
with open(train_data_file, 'rb') as f:
method_x_train, method_y_train = pickle.load(f)
hyperparams = read_yaml_from_file(hyperparams_file)
current_classifier = classifiers[ml_model]
clf = current_classifier(**hyperparams)
    clf.fit(method_x_train, method_y_train)
    return clf


# print(train_model(ml_models[1], dataset_types[0]))
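A hedged usage sketch (not part of this commit) that fits one classifier per model and dataset variant and keeps the trained classifiers for later evaluation, reusing the config imports from the code removed above:

# hypothetical driver: train every configured model on every dataset variant
from config.ml_models import ml_models
from config.ml_models import dataset_types

trained_classifiers = dict()
for ml_model in ml_models:
    for method in dataset_types:
        trained_classifiers[(ml_model, method)] = train_model(ml_model, method)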
