Cleaning code
delriot committed Apr 4, 2023
1 parent c37433d commit 3300703
Showing 14 changed files with 119 additions and 104 deletions.
3 changes: 3 additions & 0 deletions basic_ml.py
@@ -1,3 +1,6 @@
"""NOT IN USE"""


"""Contains a function to do some basic machine learning."""
import numpy as np
from tensorflow import keras
9 changes: 1 addition & 8 deletions choose_hyperparams.py
@@ -17,20 +17,13 @@
import os
import pickle
import csv
import yaml
import importlib.util
from config.ml_models import ml_models
from config.ml_models import classifiers
from config.ml_models import dataset_types
from config.hyperparameters_grid import grid
from sklearn.model_selection import GridSearchCV


def write_yaml_to_file(py_obj, filename):
with open(f'{filename}.yaml', 'w',) as f:
yaml.dump(py_obj, f, sort_keys=False)
print('Written to file successfully')

from yaml_tools import read_yaml_from_file

def k_folds_ml(x_train, y_train, model, random_state=0):
"""
49 changes: 32 additions & 17 deletions create_clean_dataset.py
@@ -1,26 +1,41 @@
import pickle
import numpy as np
from replicating_Dorians_features import extract_features
from basic_ml import use_tf, basic_ml
from itertools import product
import sys
import os
import csv
import importlib
if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)):
from dataset_manipulation import name_unique_features
from dataset_manipulation import remove_notunique_features
from dataset_manipulation import balance_dataset
from dataset_manipulation import augmentate_dataset
else:
from packages.dataset_manipulation import name_unique_features
from packages.dataset_manipulation import remove_notunique_features
from packages.dataset_manipulation import balance_dataset
from packages.dataset_manipulation import augmentate_dataset


dataset_file = os.path.join(os.path.dirname(__file__), 'DatasetsBeforeProcessing', 'dataset_without_repetition_return_ncells.txt')
f = open(dataset_file, 'rb')
dataset = pickle.load(f)
original_polys_list, names, features_list, targets_list, timings_list = extract_features(dataset)
dataset_filename = os.path.join(os.path.dirname(__file__), 'DatasetsBeforeProcessing', 'dataset_without_repetition_return_ncells.txt')
clean_dataset_filename = os.path.join(os.path.dirname(__file__),
'datasets',
'clean_dataset.txt')

# working with raw features
features = np.array(features_list)
targets = np.array(targets_list)
timings = np.array(timings_list)
original_polys = np.array(original_polys_list)

clean_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets',
'clean_dataset.txt')
g = open(clean_dataset_file, 'wb')
dataset = pickle.dump((original_polys, names, features, targets, timings), g)
def cleaning_dataset(dataset_filename, clean_dataset_filename):
with open(dataset_filename, 'rb') as f:
dataset = pickle.load(f)
original_polys_list, names, features_list, targets_list, timings_list = extract_features(dataset)

# working with raw features
features = np.array(features_list)
unique_names, unique_features = remove_notunique_features(names, features)

targets = np.array(targets_list)
timings = np.array(timings_list)
original_polys = np.array(original_polys_list)

with open(clean_dataset_filename, 'wb') as g:
dataset = pickle.dump((original_polys, unique_names, unique_features, targets, timings), g)

cleaning_dataset(dataset_filename, clean_dataset_filename)
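Since cleaning_dataset is still invoked at import time, any script importing this module would regenerate the clean dataset; a small sketch of guarding that call (an assumption, not part of this commit):

# hypothetical guard: only rebuild the dataset when the script is run directly,
# not when cleaning_dataset is imported elsewhere
if __name__ == '__main__':
    cleaning_dataset(dataset_filename, clean_dataset_filename)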
Binary file modified datasets/clean_dataset.txt
4 changes: 2 additions & 2 deletions datasets/dataset_instances.csv
@@ -1,7 +1,7 @@
dataset,zero,one,two,three,four,five,total
train normal dataset,326,74,105,41,163,106,815
train balanced dataset,126,113,149,138,144,145,815
train balanced dataset,151,121,136,152,133,122,815
train augmented dataset,815,815,815,815,815,815,4890
test normal dataset,80,19,30,10,39,26,204
test balanced dataset,31,34,32,38,34,35,204
test balanced dataset,29,27,32,48,34,34,204
test augmented dataset,204,204,204,204,204,204,1224
Binary file modified datasets/test/augmented_test_dataset.txt
Binary file modified datasets/test/balanced_test_dataset.txt
Binary file modified datasets/test/normal_test_dataset.txt
Binary file modified datasets/train/augmented_train_dataset.txt
Binary file modified datasets/train/balanced_train_dataset.txt
Binary file modified datasets/train/normal_train_dataset.txt
17 changes: 16 additions & 1 deletion packages/dataset_manipulation/dataset_manipulation.py
@@ -3,6 +3,7 @@
import math
import random
from .exploit_symmetries import give_all_symmetries
from sklearn.preprocessing import normalize

nvar = 3

@@ -73,10 +74,24 @@ def name_unique_features(names, features):
return new_names


def remove_notunique_features(unique_names, names, features):
def get_unique_feature_names(unique_names, names, features):
"""Return the features corresponding to a name in 'unique_names'."""
unique_features = []
for index, feature in enumerate(zip(*features)):
if names[index] in unique_names:
unique_features.append(feature)
return np.transpose(unique_features)


def remove_notunique_features(names, features):
    # dummy targets and timings, created only because augmentate_dataset requires them
    targets = [0] * len(features)
    timings = [[0, 0]] * len(features)
augmented_features, _, _ = augmentate_dataset(features, targets, timings)
# normalized_augmented_features = normalize(augmented_features)
unique_names = name_unique_features(names, augmented_features)
unique_features = []
for index, feature in enumerate(zip(*features)):
if names[index] in unique_names:
unique_features.append(feature)
return unique_names, np.transpose(unique_features)
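The loop at the end of remove_notunique_features repeats the body of get_unique_feature_names; an equivalent refactor sketch (not part of this commit) that composes the two helpers instead:

def remove_notunique_features(names, features):
    # dummy targets and timings, only because augmentate_dataset requires them
    targets = [0] * len(features)
    timings = [[0, 0]] * len(features)
    augmented_features, _, _ = augmentate_dataset(features, targets, timings)
    unique_names = name_unique_features(names, augmented_features)
    return unique_names, get_unique_feature_names(unique_names, names, features)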
106 changes: 44 additions & 62 deletions test_train_datasets.py
@@ -36,65 +36,47 @@


def count_instances(my_dataset, instance):
return sum(my_dataset==instance)


names_features_targets_file = os.path.join(os.path.dirname(__file__),
'datasets',
'clean_dataset.txt')
with open(names_features_targets_file, 'rb') as f:
original_polys, names, features, targets, timings = pickle.load(f)

augmented_features, augmented_targets, augmented_timings = augmentate_dataset(features, targets, timings)

normalized_augmented_features = normalize(augmented_features)
unique_names = name_unique_features(names,
augmented_features)

random_state = 0

x = dict() # to keep the features
y = dict() # to keep the labels
t = dict() # to keep the timings
# train and test sets are created
not_unique_x_normal_train, not_unique_x_normal_test, y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(features, targets, timings,
test_size=0.20,
random_state=random_state)

not_unique_balanced_x_test, y['test_balanced'], t['test_balanced'] = balance_dataset(not_unique_x_normal_test, y['test_normal'], t['test_normal'])
x['test_balanced'] = remove_notunique_features(unique_names, names, not_unique_balanced_x_test)
# testing data for all approaches is ready
# all tests will be done in balanced but the others are also computed
not_unique_augmented_x_test, y['test_augmented'], t['test_augmented'] = augmentate_dataset(not_unique_x_normal_test, y['test_normal'], t['test_normal'])
x['test_augmented'] = remove_notunique_features(unique_names, names, not_unique_augmented_x_test)
x['test_normal'] = remove_notunique_features(unique_names, names, not_unique_x_normal_test)

x['train_normal'] = remove_notunique_features(unique_names, names, not_unique_x_normal_train)
# normal training data ready
not_unique_balanced_x_train, y['train_balanced'], t['train_balanced'] = balance_dataset(not_unique_x_normal_train, y['train_normal'], t['train_normal'])
x['train_balanced'] = remove_notunique_features(unique_names, names, not_unique_balanced_x_train)
# balanced training data ready
not_unique_augmented_x_train, y['train_augmented'], t['train_augmented'] = augmentate_dataset(not_unique_x_normal_train, y['train_normal'], t['train_normal'])
x['train_augmented'] = remove_notunique_features(unique_names, names, not_unique_augmented_x_train)
# augmented training data ready


dataset_info_file = os.path.join(os.path.dirname(__file__),
'datasets',
'dataset_instances.csv')
with open(dataset_info_file, 'w') as f_dataset_info:
writer = csv.writer(f_dataset_info)
writer.writerow(['dataset'] + ['zero','one','two','three','four','five','total'])
for usage in ['train', 'test']:
for method in ['normal', 'balanced', 'augmented']:
print(f"y['{usage}_{method}'])", len(y[f'{usage}_{method}']))
this_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', usage,
f'{method}_{usage}_dataset.txt')
with open(this_dataset_file, 'wb') as f:
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}']), f)

writer.writerow([f'{usage} {method} dataset']
+ [str(count_instances(y[f'{usage}_{method}'], i))
for i in range(6)]
+ [str(len(y[f'{usage}_{method}']))])
return sum(my_dataset == instance)


def create_train_test_datasets(clean_dataset_filename):
with open(clean_dataset_filename, 'rb') as clean_dataset_file:
_, names, features, targets, timings = pickle.load(clean_dataset_file)

x = dict() # to keep the features
y = dict() # to keep the labels
t = dict() # to keep the timings
# train and test sets are created
    x['train_normal'], x['test_normal'], y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(features, targets, timings,
                                                                                                                                      test_size=0.20,
                                                                                                                                      random_state=0)
for purpose in ['train', 'test']:
x[f'{purpose}_balanced'], y[f'{purpose}_balanced'], t[f'{purpose}_balanced'] = balance_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])


dataset_info_file = os.path.join(os.path.dirname(__file__),
'datasets',
'dataset_instances.csv')
with open(dataset_info_file, 'w') as f_dataset_info:
writer = csv.writer(f_dataset_info)
writer.writerow(['dataset'] + ['zero', 'one', 'two', 'three', 'four', 'five', 'total'])
for usage in ['train', 'test']:
for method in ['normal', 'balanced', 'augmented']:
print(f"y['{usage}_{method}'])", len(y[f'{usage}_{method}']))
this_dataset_file = os.path.join(os.path.dirname(__file__),
'datasets', usage,
f'{method}_{usage}_dataset.txt')
with open(this_dataset_file, 'wb') as f:
pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}']), f)

writer.writerow([f'{usage} {method} dataset']
+ [str(count_instances(y[f'{usage}_{method}'], i))
for i in range(6)]
+ [str(len(y[f'{usage}_{method}']))])


# clean_dataset_filename = os.path.join(os.path.dirname(__file__),
# 'datasets',
# 'clean_dataset.txt')
# create_train_test_datasets(clean_dataset_filename)
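A quick way to sanity-check the pickled splits against datasets/dataset_instances.csv, assuming the (x, y) layout written above; this check is a hypothetical sketch, not part of the commit:

import os
import pickle
split_file = os.path.join(os.path.dirname(__file__), 'datasets', 'train',
                          'balanced_train_dataset.txt')
with open(split_file, 'rb') as f:
    x_balanced_train, y_balanced_train = pickle.load(f)
print([count_instances(y_balanced_train, i) for i in range(6)])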
35 changes: 21 additions & 14 deletions train_models.py
@@ -1,15 +1,22 @@
import yaml
from yaml import UnsafeLoader
import os
from config.ml_models import ml_models
from config.ml_models import dataset_types

print(ml_models)
for ml_model in ml_models:
for method in dataset_types:
filename = os.path.join(os.path.dirname(__file__),
'config', 'hyperparams',
f'{method}_{ml_model}.yaml')
with open(filename, 'r') as f:
hyperparameters = yaml.load(f, Loader=UnsafeLoader)
print(type(hyperparameters), hyperparameters)
import pickle
from yaml_tools import read_yaml_from_file
from config.ml_models import classifiers


def train_model(ml_model, method):
train_data_file = os.path.join(os.path.dirname(__file__),
'datasets', 'train',
f'{method}_train_dataset.txt')
hyperparams_file = os.path.join(os.path.dirname(__file__),
'config', 'hyperparams',
f'{method}_{ml_model}')
with open(train_data_file, 'rb') as f:
method_x_train, method_y_train = pickle.load(f)
hyperparams = read_yaml_from_file(hyperparams_file)
current_classifier = classifiers[ml_model]
clf = current_classifier(**hyperparams)
    clf.fit(method_x_train, method_y_train)
    return clf


# print(train_model(ml_models[1], dataset_types[0]))
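A hedged usage sketch (not part of this commit) that fits one classifier per model and dataset variant and keeps the trained classifiers for later evaluation, reusing the config imports from the code removed above:

# hypothetical driver: train every configured model on every dataset variant
from config.ml_models import ml_models
from config.ml_models import dataset_types

trained_classifiers = dict()
for ml_model in ml_models:
    for method in dataset_types:
        trained_classifiers[(ml_model, method)] = train_model(ml_model, method)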
