Commit

Now the main function is very clear and runs everything
delriot committed Apr 6, 2023
1 parent 3300703 commit f83a2ce
Showing 12 changed files with 71 additions and 173 deletions.
17 changes: 15 additions & 2 deletions choose_hyperparams.py
@@ -17,13 +17,15 @@
import os
import pickle
import csv
import importlib.util
from config.ml_models import ml_models
from config.ml_models import classifiers
from config.ml_models import dataset_types
from config.hyperparameters_grid import grid
from sklearn.model_selection import GridSearchCV
from yaml_tools import read_yaml_from_file
from yaml_tools import write_yaml_to_file
from find_filename import find_dataset_filename
from find_filename import find_hyperparams_filename


def k_folds_ml(x_train, y_train, model, random_state=0):
"""
@@ -40,6 +42,17 @@ def k_folds_ml(x_train, y_train, model, random_state=0):
return rf_cv.best_params_


def choose_hyperparams(ml_model, method):
"""Given a ml_model and a method, a file with the hyperparameters
chosen by cross validation is created"""
this_dataset_file = find_dataset_filename('train', method=method)
with open(this_dataset_file, 'rb') as f:
method_x_train, method_y_train = pickle.load(f)
hyperparams = k_folds_ml(method_x_train, method_y_train, model=ml_model)
hyperparams_filename = find_hyperparams_filename(method, ml_model)
write_yaml_to_file(hyperparams, hyperparams_filename)


test_balanced_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', 'test',
'balanced_test_dataset.txt')
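
The yaml_tools helpers imported above are not shown in this commit. A minimal sketch of what they might contain: write_yaml_to_file follows the inlined version removed from main.py later in this diff, and read_yaml_from_file is an assumed yaml.safe_load counterpart.

# yaml_tools.py (sketch; not part of this diff)
import yaml

def write_yaml_to_file(py_obj, filename):
    # Same behaviour as the helper removed from main.py in this commit.
    with open(f'{filename}.yaml', 'w') as f:
        yaml.dump(py_obj, f, sort_keys=False)
    print('Written to file successfully')

def read_yaml_from_file(filename):
    # Assumed counterpart: load the saved hyperparameters back as a dict.
    with open(f'{filename}.yaml', 'r') as f:
        return yaml.safe_load(f)
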
29 changes: 12 additions & 17 deletions create_clean_dataset.py
@@ -1,25 +1,11 @@
import pickle
import numpy as np
from replicating_Dorians_features import extract_features
import sys
import os
import importlib
if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
from dataset_manipulation import name_unique_features
from dataset_manipulation import remove_notunique_features
from dataset_manipulation import balance_dataset
from dataset_manipulation import augmentate_dataset
else:
from packages.dataset_manipulation import name_unique_features
from packages.dataset_manipulation import remove_notunique_features
from packages.dataset_manipulation import balance_dataset
from packages.dataset_manipulation import augmentate_dataset


dataset_filename = os.path.join(os.path.dirname(__file__), 'DatasetsBeforeProcessing', 'dataset_without_repetition_return_ncells.txt')
clean_dataset_filename = os.path.join(os.path.dirname(__file__),
'datasets',
'clean_dataset.txt')


def cleaning_dataset(dataset_filename, clean_dataset_filename):
@@ -35,7 +21,16 @@ def cleaning_dataset(dataset_filename, clean_dataset_filename):
timings = np.array(timings_list)
original_polys = np.array(original_polys_list)

with open(clean_dataset_filename, 'wb') as g:
dataset = pickle.dump((original_polys, unique_names, unique_features, targets, timings), g)
with open(clean_dataset_filename, 'wb') as clean_dataset_file:
dataset = pickle.dump((original_polys, unique_names,
unique_features, targets, timings),
clean_dataset_file)


cleaning_dataset(dataset_filename, clean_dataset_filename)
# dataset_filename = os.path.join(os.path.dirname(__file__),
# 'DatasetsBeforeProcessing',
# 'dataset_without_repetition_return_ncells.txt')
# clean_dataset_filename = os.path.join(os.path.dirname(__file__),
# 'datasets',
# 'clean_dataset.txt')
# cleaning_dataset(dataset_filename, clean_dataset_filename)
4 changes: 2 additions & 2 deletions datasets/dataset_instances.csv
@@ -1,7 +1,7 @@
dataset,zero,one,two,three,four,five,total
train normal dataset,326,74,105,41,163,106,815
train balanced dataset,151,121,136,152,133,122,815
train balanced dataset,130,120,135,143,135,152,815
train augmented dataset,815,815,815,815,815,815,4890
test normal dataset,80,19,30,10,39,26,204
test balanced dataset,29,27,32,48,34,34,204
test balanced dataset,34,31,32,37,39,31,204
test augmented dataset,204,204,204,204,204,204,1224
Binary file modified datasets/test/augmented_test_dataset.txt
Binary file modified datasets/test/balanced_test_dataset.txt
Binary file modified datasets/test/normal_test_dataset.txt
Binary file modified datasets/train/augmented_train_dataset.txt
Binary file modified datasets/train/balanced_train_dataset.txt
Binary file modified datasets/train/normal_train_dataset.txt
137 changes: 19 additions & 118 deletions main.py
@@ -12,125 +12,26 @@
Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science,
vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30
"""
from config.ml_models import ml_models
from config.ml_models import dataset_types
from find_filename import find_dataset_filename
from create_clean_dataset import cleaning_dataset
from test_train_datasets import create_train_test_datasets
from choose_hyperparams import choose_hyperparams
from train_models import train_model
from test_models import test_results


import os
import pickle
import random
import csv
import yaml
import importlib.util
# Check if 'dataset_manipulation' is installed
if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
from dataset_manipulation import name_unique_features
from dataset_manipulation import remove_notunique_features
from dataset_manipulation import balance_dataset
from dataset_manipulation import augmentate_dataset
else:
from packages.dataset_manipulation import name_unique_features
from packages.dataset_manipulation import remove_notunique_features
from packages.dataset_manipulation import balance_dataset
from packages.dataset_manipulation import augmentate_dataset
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from basic_ml import basic_ml
from k_folds_ml import k_folds_ml
original_dataset_file = find_dataset_filename('unclean')
clean_dataset_filename = find_dataset_filename('clean')
cleaning_dataset(original_dataset_file, clean_dataset_filename)
create_train_test_datasets()

def write_yaml_to_file(py_obj,filename):
with open(f'{filename}.yaml', 'w',) as f :
yaml.dump(py_obj,f,sort_keys=False)
print('Written to file successfully')



names_features_targets_file = os.path.join(os.path.dirname(__file__),
'datasets',
'clean_dataset.txt')
with open(names_features_targets_file, 'rb') as f:
original_polys, names, features, targets, timings = pickle.load(f)


augmented_features, augmented_targets, augmented_timings = augmentate_dataset(features, targets, timings)

normalized_augmented_features = normalize(augmented_features)
# an alternative approach to normalizing
# features = np.transpose(normalize_features(features))
unique_names = name_unique_features(names,
augmented_features)

random_state = 0
# Other random states may be tried to check that similar results are achieved
random.seed(random_state)

# Models that will be used are chosen
ml_models = ['KNN', 'DT', 'MLP', 'SVC', 'RF'] # , 'my_mlp'

# train and test sets are created
x_train, x_test, y_train, y_test, t_train, t_test = train_test_split(features, targets, timings,
test_size=0.20,
random_state=random_state)
# test features are balanced
bal_x_test, bal_y_test, bal_t_test = balance_dataset(x_test, y_test, t_test)
# and the repeated features are removed before presenting them to any ml_model
# we will ensure that instances sent to the models don't have repeated features
unique_bal_x_test = remove_notunique_features(unique_names, names, bal_x_test)
# testing data for all approaches is ready
unique_x_train = remove_notunique_features(unique_names, names, x_train)
# training data without changes ready
bal_x_train, bal_y_train, bal_t_train = balance_dataset(x_train, y_train, t_train)
unique_bal_x_train = remove_notunique_features(unique_names, names, bal_x_train)
# balanced training data ready
aug_x_train, aug_y_train, aug_t_train = augmentate_dataset(x_train, y_train, t_train)
unique_aug_x_train = remove_notunique_features(unique_names, names, aug_x_train)
# augmented training data ready

# output_file = os.path.join(os.path.dirname(__file__),
# 'ml_results.csv')
# with open(output_file, 'w') as f:
# writer = csv.writer(f)
# writer.writerow(["Name", "Normal", "Balance data", "Augment data"])
# for ml_model in ml_models:
# acc_basic = basic_ml(unique_x_train, unique_bal_x_test,
# y_train, bal_y_test,
# ml_model, random_state=random_state)

# acc_bal = basic_ml(unique_bal_x_train, unique_bal_x_test,
# bal_y_train, bal_y_test,
# ml_model, random_state=random_state)

# acc_augmented = basic_ml(unique_aug_x_train, unique_bal_x_test,
# aug_y_train, bal_y_test,
# ml_model, random_state=random_state)

# round_accuracies = [round(acc, 2) for acc in [acc_basic,
# acc_bal,
# acc_augmented]]
# writer.writerow([ml_model] + round_accuracies)

# output_file = os.path.join(os.path.dirname(__file__),
# 'ml_results_k_fold.csv')
# with open(output_file, 'w') as f:
# writer = csv.writer(f)
# writer.writerow(["Name", "Normal", "Balance data", "Augment data"])
# print(f"{method}")
# print(f"The accuracies of {ml_model} are:\n Normal: {acc_basic} \n Balanced: {acc_bal}\n Augmented: {acc_augmented}")

# round_accuracies = [round(acc, 2) for acc in [acc_basic,
# acc_bal,
# acc_augmented]]
# writer.writerow([ml_model] + round_accuracies)

x_and_y_per_method = dict()
x_and_y_per_method['basic'] = (unique_x_train, y_train)
x_and_y_per_method['balanced'] = (unique_bal_x_train, bal_y_train)
x_and_y_per_method['augmented'] = (unique_aug_x_train, aug_y_train)
for ml_model in ml_models:
print(f"Model: {ml_model}")
for method in ['basic', 'balanced', 'augmented']:
method_x_train, method_y_train = x_and_y_per_method[method]
hyperparams = k_folds_ml(method_x_train, method_y_train,
model=ml_model)
write_yaml_to_file(hyperparams,
f'UsingDorianFeatures\\config\\hyperparams\\{method}_{ml_model}')
for train_data in ['basic', 'balanced']:
clf = ml_model()
for method in dataset_types:
choose_hyperparams(ml_model, method)
for ml_model in ml_models:
for method in dataset_types:
train_model(ml_model, method)
for testing_method in ['normal', 'balanced']:
test_results(testing_method)
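
Because the rendering above interleaves removed and added lines without +/- markers, the resulting top-level flow of main.py after this commit is reassembled below as a sketch. The contents of config.ml_models are not part of this diff; the example values in the comments are taken from the list removed above (ml_models) or are assumptions (dataset_types).

# main.py after this commit (reassembled sketch)
from config.ml_models import ml_models      # e.g. ['KNN', 'DT', 'MLP', 'SVC', 'RF'], per the removed list above
from config.ml_models import dataset_types  # assumed to name the dataset variants, e.g. ['normal', 'balanced', 'augmented']
from find_filename import find_dataset_filename
from create_clean_dataset import cleaning_dataset
from test_train_datasets import create_train_test_datasets
from choose_hyperparams import choose_hyperparams
from train_models import train_model
from test_models import test_results

original_dataset_file = find_dataset_filename('unclean')
clean_dataset_filename = find_dataset_filename('clean')
cleaning_dataset(original_dataset_file, clean_dataset_filename)
create_train_test_datasets()
for ml_model in ml_models:
    for method in dataset_types:
        choose_hyperparams(ml_model, method)
for ml_model in ml_models:
    for method in dataset_types:
        train_model(ml_model, method)
for testing_method in ['normal', 'balanced']:
    test_results(testing_method)
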
27 changes: 9 additions & 18 deletions test_train_datasets.py
@@ -16,57 +16,51 @@

import os
import pickle
import random
import csv
import yaml
import importlib.util
# Check if 'dataset_manipulation' is installed
if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
from dataset_manipulation import name_unique_features
from dataset_manipulation import remove_notunique_features
from dataset_manipulation import balance_dataset
from dataset_manipulation import augmentate_dataset
else:
from packages.dataset_manipulation import name_unique_features
from packages.dataset_manipulation import remove_notunique_features
from packages.dataset_manipulation import balance_dataset
from packages.dataset_manipulation import augmentate_dataset
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from find_filename import find_dataset_filename


def count_instances(my_dataset, instance):
return sum(my_dataset == instance)


def create_train_test_datasets(clean_dataset_filename):
def create_train_test_datasets():
clean_dataset_filename = find_dataset_filename('clean')
with open(clean_dataset_filename, 'rb') as clean_dataset_file:
_, names, features, targets, timings = pickle.load(clean_dataset_file)
unique_names, unique_features = remove_notunique_features(names, features)

x = dict() # to keep the features
y = dict() # to keep the labels
t = dict() # to keep the timings
# train and test sets are created
random_state = 0
x['train_normal'], x['test_normal'], y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(unique_features, targets, timings,
test_size=0.20,
random_state=random_state)
for purpose in ['train', 'test']:
x[f'{purpose}_balanced'], y[f'{purpose}_balanced'], t[f'{purpose}_balanced'] = balance_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])


dataset_info_file = os.path.join(os.path.dirname(__file__),
'datasets',
'dataset_instances.csv')
dataset_info_file = find_dataset_filename('instances')
with open(dataset_info_file, 'w') as f_dataset_info:
writer = csv.writer(f_dataset_info)
writer.writerow(['dataset'] + ['zero', 'one', 'two', 'three', 'four', 'five', 'total'])
for usage in ['train', 'test']:
for method in ['normal', 'balanced', 'augmented']:
print(f"y['{usage}_{method}'])", len(y[f'{usage}_{method}']))
this_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', usage,
f'{method}_{usage}_dataset.txt')
'datasets', usage,
f'{method}_{usage}_dataset.txt')
with open(this_dataset_file, 'wb') as f:
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}']), f)

@@ -76,7 +70,4 @@ def create_train_test_datasets(clean_dataset_filename):
+ [str(len(y[f'{usage}_{method}']))])


# clean_dataset_filename = os.path.join(os.path.dirname(__file__),
# 'datasets',
# 'clean_dataset.txt')
# create_train_test_datasets(clean_dataset_filename)
# create_train_test_datasets()
30 changes: 14 additions & 16 deletions train_models.py
@@ -1,22 +1,20 @@
import os
import pickle
from yaml_tools import read_yaml_from_file
from config.ml_models import classifiers
from find_filename import find_dataset_filename
from find_filename import find_hyperparams_filename
from find_filename import find_model_filename


def train_model(ml_model, method):
train_data_file = os.path.join(os.path.dirname(__file__),
'datasets', 'train',
f'{method}_train_dataset.txt')
hyperparams_file = os.path.join(os.path.dirname(__file__),
'config', 'hyperparams',
f'{method}_{ml_model}')
with open(train_data_file, 'rb') as f:
method_x_train, method_y_train = pickle.load(f)
hyperparams = read_yaml_from_file(hyperparams_file)
current_classifier = classifiers[ml_model]
clf = current_classifier(**hyperparams)
clf.fit(method_x_train, method_y_train)


# print(train_model(ml_models[1], dataset_types[0]))
train_data_filename = find_dataset_filename('train', method=method)
hyperparams_file = find_hyperparams_filename(method, ml_model)
with open(train_data_filename, 'rb') as train_data_file:
x_train, y_train = pickle.load(train_data_file)
hyperparams = read_yaml_from_file(hyperparams_file)
current_classifier = classifiers[ml_model]
clf = current_classifier(**hyperparams)
clf.fit(x_train, y_train)
trained_model_filename = find_model_filename(method, ml_model)
with open(trained_model_filename, 'wb') as trained_model_file:
pickle.dump(clf, trained_model_file)
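
find_filename.py itself is not part of this commit. A plausible sketch of the helpers it must provide, inferred from the hard-coded paths that this commit removes; the exact purpose keys, the models directory, and the overall layout are assumptions.

# find_filename.py (sketch; inferred from the paths removed in this commit)
import os

BASE = os.path.dirname(__file__)

def find_dataset_filename(purpose, method=None):
    # 'unclean' -> raw dataset, 'clean' -> cleaned dataset,
    # 'train'/'test' -> per-method pickles, 'instances' -> the CSV summary.
    if purpose == 'unclean':
        return os.path.join(BASE, 'DatasetsBeforeProcessing',
                            'dataset_without_repetition_return_ncells.txt')
    elif purpose == 'clean':
        return os.path.join(BASE, 'datasets', 'clean_dataset.txt')
    elif purpose == 'instances':
        return os.path.join(BASE, 'datasets', 'dataset_instances.csv')
    return os.path.join(BASE, 'datasets', purpose,
                        f'{method}_{purpose}_dataset.txt')

def find_hyperparams_filename(method, ml_model):
    return os.path.join(BASE, 'config', 'hyperparams', f'{method}_{ml_model}')

def find_model_filename(method, ml_model):
    # Assumed location for the pickled classifiers saved by train_model.
    return os.path.join(BASE, 'models', f'{method}_{ml_model}')
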
