Commit

Added some essential files
Tereso del Rio committed Sep 14, 2023
1 parent 94aaa66 commit f48eafb
Showing 7 changed files with 373 additions and 0 deletions.
3 changes: 3 additions & 0 deletions config/general_values.py
@@ -0,0 +1,3 @@

purposes = ['Train', 'Test']
dataset_qualities = ['Normal', 'Balanced', 'Augmented']
70 changes: 70 additions & 0 deletions config/hyperparameters_grid.py
@@ -0,0 +1,70 @@
"""Contains the grid of hyperparameters that each model will try"""

grid = dict()
grid['RF'] = {
'n_estimators': [200, 300, 400, 500],
'max_features': ['sqrt', 'log2'],
'max_depth': [4, 5, 6, 7, 8],
'criterion': ['gini', 'entropy']
}
grid['KNN'] = {
    'n_neighbors': [1, 3, 5, 7, 12],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    # 'leaf_size': range(1, 10, 3),
    # 'p': range(1, 4, 1)
}
grid['MLP'] = {
    'hidden_layer_sizes': [(5, 5), (15, 15), (20, 20), (10, 10, 10), (20, 20, 20)],
    # alternatively: [(i, i) for i in range(20, 50, 5)] + [(i, i, i) for i in range(20, 50, 5)]
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['constant', 'adaptive'],
    'alpha': [0.05, 0.005],
    'max_iter': [1000]
}
grid['DT'] = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 4, 7, 10, 13, 16, 19]
}
grid['SVC'] = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'tol': [0.0316],
    'C': [5, 100, 200, 300],
    'gamma': ['scale', 'auto']
}

grid['RFR'] = {
    'criterion': ['squared_error', 'friedman_mse'],
    'max_depth': [1, 3, 7],
    'min_samples_leaf': [1, 5, 10]
}
grid['KNNR'] = {
'n_neighbors': [3, 5, 10],
'weights': ['uniform', 'distance'],
'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
grid['MLPR'] = {
'hidden_layer_sizes': [(100,), (20, 20), (10, 10, 10)],
'activation': ['logistic', 'tanh', 'relu'],
'solver': ['adam', 'sgd'],
'alpha': [0.0001, 0.001, 0.01]
}
grid['DTR'] = {
    'splitter': ['best', 'random'],
    'max_depth': [1, 3, 7, 12],
    'min_samples_leaf': [1, 5, 10],
    # 'min_weight_fraction_leaf': [0.1, 0.5, 0.9],
    # 'max_features': ['auto', 'log2', 'sqrt', None],
    # 'max_leaf_nodes': [None, 10, 50, 90]
}
grid['SVR'] = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [1.5, 10],
    'gamma': [1e-7, 1e-4],
    'epsilon': [0.1, 0.2, 0.3, 0.5]
}
grid['SGD'] = {
    'loss': ['squared_error', 'huber', 'epsilon_insensitive'],
    'penalty': ['l2', 'l1', 'elasticnet']
}
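
For context, a minimal sketch of how one of these grids would feed scikit-learn's GridSearchCV; the estimator, scoring and cv values here are illustrative assumptions, not part of this file:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hypothetical usage of the 'RF' grid defined above.
search = GridSearchCV(RandomForestClassifier(), grid['RF'],
                      scoring='accuracy', cv=5, n_jobs=-1)
# search.fit(x_train, y_train)  # x_train/y_train supplied by the caller
# print(search.best_params_)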
42 changes: 42 additions & 0 deletions config/ml_models.py
@@ -0,0 +1,42 @@
"""Contains the ml models that will be used in the project"""

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

ml_models = [
'KNN',
'DT',
'SVC',
'RF',
'MLP'
]

ml_regressors = [
'DTR',
'SVR',
'RFR',
'KNNR',
'MLPR'
]

sklearn_models = {
'DT': DecisionTreeClassifier,
'KNN': KNeighborsClassifier,
'RF': RandomForestClassifier,
'SVC': SVC,
'MLP': MLPClassifier,
'DTR': DecisionTreeRegressor,
'KNNR': KNeighborsRegressor,
'RFR': RandomForestRegressor,
'SVR': SVR,
'MLPR': MLPRegressor
}
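
These name-to-class mappings are keyed consistently with the grids in config/hyperparameters_grid.py. A small illustrative sketch, assuming both config modules are importable from the project root:

from config.ml_models import sklearn_models
from config.hyperparameters_grid import grid

# Hypothetical: build each classifier from the first value of every
# hyperparameter in its grid, to show how the two configs line up.
for name in ['DT', 'KNN']:
    params = {key: values[0] for key, values in grid[name].items()}
    model = sklearn_models[name](**params)
    print(name, model)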
49 changes: 49 additions & 0 deletions find_filename.py
@@ -0,0 +1,49 @@
import os
from config.general_values import dataset_qualities
from config.general_values import purposes


def find_hyperparams_filename(method, ml_model):
return os.path.join(os.path.dirname(__file__),
'config', 'hyperparams',
f'{method}_{ml_model}')


def find_model_filename(method, ml_model):
return os.path.join(os.path.dirname(__file__),
'config', 'models',
f'{method}_{ml_model}.txt')


def find_dataset_filename(purpose, method=None):
if purpose == "unclean":
return os.path.join(os.path.dirname(__file__),
'DatasetsBeforeProcessing',
'dataset_without_repetition_return_ncells.txt')
# 'dataset_with_repetition_return_ncells.txt')
# for returning "repeated" instances
# those with the same number of cells for all projections
elif purpose == "clean":
return os.path.join(os.path.dirname(__file__),
'datasets',
'clean_dataset.txt')
elif purpose == 'instances':
return os.path.join(os.path.dirname(__file__),
'datasets',
'dataset_instances.csv')
    elif purpose in purposes:
        return os.path.join(os.path.dirname(__file__),
                            'datasets', purpose,
                            f'{method}_{purpose}_dataset.txt')
    else:
        raise ValueError(f"Purpose {purpose} not found")


def find_output_filename(training_method):
return os.path.join(os.path.dirname(__file__), 'results',
f'ml_trained_in_{training_method}.csv')


def find_other_filename(search):
return os.path.join(os.path.dirname(__file__), 'config',
f'{search}.txt')
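
For illustration, the kinds of paths these helpers resolve to ('Normal' is one of the dataset_qualities from config/general_values.py; the directory layout is the one encoded above):

from find_filename import find_dataset_filename, find_model_filename

# e.g. <project>/datasets/Test/Normal_Test_dataset.txt
print(find_dataset_filename('Test', method='Normal'))
# e.g. <project>/config/models/classification_RF.txt
print(find_model_filename('classification', 'RF'))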
46 changes: 46 additions & 0 deletions preprocessing_Dorians_features.py
@@ -0,0 +1,46 @@
"""IS THIS BEING USED?"""
import numpy as np
from sklearn.preprocessing import normalize


def convert_to_numpy_floats(features):
return np.array([np.array([np.float64(feature)
for feature in feature_list])
for feature_list in features])


def normalize_features(features):
    """
    Normalize each column of features.

    The new mean is 0 and the standard deviation is 1 in each column.
    Returns a list of normalized columns.
    """
    normal_features = []
    for feature in zip(*features):
        feature = np.array(feature)  # zip yields tuples; convert for arithmetic
        mean = np.mean(feature)
        std = np.std(feature)
        if std != 0:
            normal_features.append((feature - mean) / std)
        else:
            normal_features.append(feature - mean)
    return normal_features


def normalize_features2(features):
    """
    Scale each column of features to unit L2 norm.

    Unlike normalize_features, this does not centre the columns:
    sklearn's normalize rescales each column so that its L2 norm is 1.
    """
    return normalize(features, axis=0)


# v = convert_to_numpy_floats([[2, 1, 4, 1, 41], [3, 1, 142, 12, 1], [21, 12, 34, 123, 2]])
# print(v[0, 1])
# print(normalize_features2(v) == normalize_features(v))
# print(normalize_features2(v))
# print(normalize_features(v))
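
As an aside, a minimal sketch of the idiomatic scikit-learn equivalent of normalize_features, using StandardScaler (not part of this commit):

import numpy as np
from sklearn.preprocessing import StandardScaler

features = np.array([[2.0, 1.0], [3.0, 1.0], [21.0, 12.0]])
# StandardScaler centres each column to mean 0 and unit variance,
# matching normalize_features but keeping rows as rows.
scaled = StandardScaler().fit_transform(features)
print(scaled.mean(axis=0))  # approximately [0, 0]
print(scaled.std(axis=0))   # approximately [1, 1]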
148 changes: 148 additions & 0 deletions test_models.py
@@ -0,0 +1,148 @@
import csv
import pickle
import importlib.util
import numpy as np
from sklearn import metrics
from config.general_values import dataset_qualities
from config.ml_models import ml_models
from find_filename import find_output_filename
from find_filename import find_dataset_filename
from find_filename import find_model_filename
# Check if 'dataset_manipulation' is installed
if importlib.util.find_spec('dataset_manipulation') is None:
    from exploit_symmetries import give_all_symmetries
else:
    from packages.dataset_manipulation.exploit_symmetries import give_all_symmetries


# def test_model(trained_model_filename, test_dataset_filename):
# with open(trained_model_filename, 'rb') as trained_model_file:
# model = pickle.load(trained_model_file)
# with open(test_dataset_filename, 'rb') as test_dataset_file:
# x_test, y_test, _ = pickle.load(test_dataset_file)
# y_pred = model.predict(x_test)
# return metrics.accuracy_score(y_test, y_pred)


def test_results(training_method):
    output_filename = find_output_filename(training_method)
    with open(output_filename, 'w') as output_file:
        writer_balanced = csv.writer(output_file)
        writer_balanced.writerow(["Name"] + dataset_qualities)
        for ml_model in ml_models:
            accuracy = dict()
            for testing_method in dataset_qualities:
                # test_model returns a metrics dict; keep only its accuracy
                accuracy[testing_method] = test_model(ml_model,
                                                      training_method,
                                                      testing_method)['Accuracy']
                print('testing_method', testing_method)
                print('ml_model', ml_model)
                print('acc', accuracy[testing_method])
            round_accuracies = [round(accuracy[method], 2)
                                for method in dataset_qualities]
            writer_balanced.writerow([ml_model + "-" + training_method] +
                                     round_accuracies)


def test_classifier(ml_model, testing_method='augmented'):
    trained_model_filename = find_model_filename('classification',
                                                 ml_model)
    test_dataset_filename = find_dataset_filename('Test',
                                                  testing_method)
    with open(trained_model_filename, 'rb') as trained_model_file:
        model = pickle.load(trained_model_file)
    with open(test_dataset_filename, 'rb') as test_dataset_file:
        # the test dataset is assumed to be stored as a dict,
        # as in test_model below (compute_metrics also needs 'cells')
        testing_dataset = pickle.load(test_dataset_file)
    chosen_indices = [return_regressor_choice(model, features)
                      for features in testing_dataset['features']]
    return compute_metrics(chosen_indices,
                           testing_dataset['labels'],
                           testing_dataset['timings'],
                           testing_dataset['cells'])


def timings_in_test(model, testing_method='augmented', training_method=None):
    test_dataset_filename = find_dataset_filename('Test',
                                                  testing_method)
    with open(test_dataset_filename, 'rb') as test_dataset_file:
        x_test, _, all_timings = pickle.load(test_dataset_file)
    if model == 'optimal':
        t_pred = [min(timings) for timings in all_timings]
    else:
        trained_model_filename = find_model_filename(training_method,
                                                     model)
        with open(trained_model_filename, 'rb') as trained_model_file:
            model = pickle.load(trained_model_file)
        y_pred = model.predict(x_test)
        # This doesn't work because augmented and balanced
        # only return one timing, not 6
        t_pred = [timings[y] for timings, y in zip(all_timings, y_pred)]
    return t_pred


def test_regressor(ml_model):
    trained_model_filename = find_model_filename('regression',
                                                 ml_model)
    test_dataset_filename = find_dataset_filename('Test',
                                                  'regression')
    with open(trained_model_filename, 'rb') as trained_model_file:
        model = pickle.load(trained_model_file)
    with open(test_dataset_filename, 'rb') as test_dataset_file:
        x_test, y_test, all_timings = pickle.load(test_dataset_file)
    y_pred = model.predict(x_test)
    avg_error = sum(abs(p - t) for p, t in zip(y_pred, y_test)) / len(y_pred)
    print(f"{ml_model} gave an average absolute error of {avg_error}")


def test_model(ml_model, paradigm, testing_method='augmented'):
trained_model_filename = find_model_filename(paradigm,
ml_model)
print(trained_model_filename, paradigm, ml_model)
test_dataset_filename = find_dataset_filename('Test',
testing_method)
with open(trained_model_filename, 'rb') as trained_model_file:
model = pickle.load(trained_model_file)
with open(test_dataset_filename, 'rb') as test_dataset_file:
testing_dataset = pickle.load(test_dataset_file)
chosen_indices = [return_regressor_choice(model, features)
for features in testing_dataset['features']]
return compute_metrics(chosen_indices,
testing_dataset['labels'],
testing_dataset['timings'],
testing_dataset['cells'])


def compute_metrics(chosen_indices, labels, all_timings, all_cells):
    results = dict()  # named so it does not shadow sklearn's metrics module
    correct = 0
    results['Total time'] = 0
    total_markup = 0
    results['Completed'] = 0
    results['Total cells'] = 0
    for chosen_index, label, timings, cells in \
            zip(chosen_indices, labels, all_timings, all_cells):
        if chosen_index == label:
            correct += 1
        # timings of 30 and 60 appear to be timeout placeholders
        if timings[chosen_index] not in [30, 60]:
            results['Completed'] += 1
        results['Total time'] += timings[chosen_index]
        total_markup += (timings[chosen_index] - timings[label]) / (timings[label] + 1)
        results['Total cells'] += cells[chosen_index]
    total_instances = len(chosen_indices)
    results['Accuracy'] = correct / total_instances
    results['Markup'] = total_markup / total_instances
    return results
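
A toy illustration of compute_metrics on hypothetical data, to make the markup formula concrete: when the chosen heuristic takes 2.0s and the labelled best takes 0.5s, the markup is (2.0 - 0.5) / (0.5 + 1) = 1.0.

# Hypothetical data: two instances, three heuristics each.
chosen = [0, 2]
labels = [0, 1]  # index of the fastest heuristic per instance
timings = [[1.0, 2.0, 4.0], [3.0, 0.5, 2.0]]
cells = [[10, 20, 40], [30, 5, 20]]
print(compute_metrics(chosen, labels, timings, cells))
# -> Accuracy 0.5, Markup 0.5, Completed 2, Total time 3.0, Total cells 30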


def return_regressor_choice(model, features):
    features_all_symmetries = give_all_symmetries(features)
    y_op = float('inf')
    index_op = 0
    for index, x_features in enumerate(features_all_symmetries):
        # predict() returns an array; take its single element
        y_pred = model.predict([x_features])[0]
        if y_op > y_pred:
            y_op = y_pred
            index_op = index
    return index_op
15 changes: 15 additions & 0 deletions yaml_tools.py
@@ -0,0 +1,15 @@
import yaml


def write_yaml_to_file(py_obj, filename):
    with open(f'{filename}.yaml', 'w') as f:
        yaml.dump(py_obj, f, sort_keys=False)
    print('Written to file successfully')


def read_yaml_from_file(filename):
    with open(f'{filename}.yaml') as f:
        # yaml.Loader (unlike yaml.safe_load) can reconstruct arbitrary
        # Python objects, so only use it on trusted files
        py_obj = yaml.load(f, Loader=yaml.Loader)
    print('Read from file successfully')
    return py_obj
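
A round-trip sanity check for these helpers (purely illustrative; the filename is hypothetical):

from yaml_tools import write_yaml_to_file, read_yaml_from_file

hyperparams = {'n_estimators': 300, 'max_depth': 6}
write_yaml_to_file(hyperparams, 'example_hyperparams')
assert read_yaml_from_file('example_hyperparams') == hyperparams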
