Skip to content
Navigation Menu
Toggle navigation
Sign in
In this repository
All GitHub Enterprise
↵
Jump to
↵
No suggested jump to results
In this repository
All GitHub Enterprise
↵
Jump to
↵
In this user
All GitHub Enterprise
↵
Jump to
↵
In this repository
All GitHub Enterprise
↵
Jump to
↵
Sign in
Reseting focus
You signed in with another tab or window.
Reload
to refresh your session.
You signed out in another tab or window.
Reload
to refresh your session.
You switched accounts on another tab or window.
Reload
to refresh your session.
Dismiss alert
{{ message }}
delriot
/
AugmentingMathematicalDataset
Public
Notifications
You must be signed in to change notification settings
Fork
0
Star
1
Code
Issues
0
Pull requests
0
Projects
0
Security
Insights
Additional navigation options
Code
Issues
Pull requests
Projects
Security
Insights
Files
main
.github
Heuristics
config
datasets
packages
utils
README.md
basic_ml.py
choose_hyperparams.py
create_clean_dataset.py
find_filename.py
main.py
main_heuristics.py
main_regression.py
main_reinforcement.py
make_plots.py
output.txt
preprocessing_Dorians_features.py
replicating_Dorians_features.py
requirements.txt
run_for_paper.py
test_models.py
test_train_datasets.py
train_models.py
yaml_tools.py
Breadcrumbs
AugmentingMathematicalDataset
/
run_for_paper.py
Blame
Blame
Latest commit
History
History
178 lines (159 loc) · 7.25 KB
Breadcrumbs
AugmentingMathematicalDataset
/
run_for_paper.py
Top
File metadata and controls
Code
Blame
178 lines (159 loc) · 7.25 KB
Raw
"""Run every model and heuristic for the paper's experiments.

For each model/heuristic this script (optionally) tunes hyperparameters,
trains, evaluates on the testing dataset over several repetitions, pickles
the aggregated results, and finally draws a box plot comparing the total
timings of all models.
"""
import os
import pickle

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from create_clean_dataset import cleaning_dataset
from test_train_datasets import create_train_test_datasets
from test_train_datasets import create_regression_datasets
from config.ml_models import all_models
from config.ml_models import regressors
from config.ml_models import classifiers
from config.ml_models import heuristics
from choose_hyperparams import choose_hyperparams
from train_models import train_model
from main_heuristics import ordering_choices_heuristics
from find_filename import find_dataset_filename
# from find_filename import find_timings_lists
from find_filename import find_hyperparams_filename
from find_filename import find_all_info
from test_models import compute_metrics
from test_models import choose_indices


# def metrics_for_all_reps(all_indices_chosen, testing_dataset, ml_model):
#     all_metrics = [compute_metrics(chosen_indices, testing_dataset)
#                    for chosen_indices in all_indices_chosen]
#     aveg_metrics = {key: sum(metrics[key]/len(all_metrics)
#                              for metrics in all_metrics)
#                     for key in all_metrics[0]}
#     all_timings = testing_dataset['timings']
#     aveg_timings = []
#     for instance in range(len(all_indices_chosen[0])):
#         instance_timings = [timings[indices_chosen[instance]]
#                             for timings, indices_chosen
#                             in zip(all_timings, all_indices_chosen)]
#         aveg_timings.append(instance_timings)
#     timings_lists_filename = find_timings_lists(ml_model)
#     with open(timings_lists_filename, 'wb') as timings_lists_file:
#         pickle.dump(aveg_timings, timings_lists_file)
#     all_total_times = [metrics['TotalTime'] for metrics in all_metrics]
#     return aveg_metrics, all_total_times


def dominiks_plots(all_total_times):
    """Draw a seaborn box plot comparing models by total time.

    ``all_total_times`` maps a model name to a list of total times
    (one entry per repetition).
    """
    # Flatten the mapping into long-format rows for seaborn.
    data = []
    for key in all_total_times:
        data.extend([{'Model': key, 'Total time': total_time}
                     for total_time in all_total_times[key]])
    df = pd.DataFrame(data)
    # Create a box plot
    plt.figure(figsize=(8, 6))
    sns.boxplot(x='Model', y='Total time', data=df)
    # Add labels and title
    plt.xlabel('Model')
    plt.ylabel('Total time')
    plt.title('Model Total time Comparison')
    # Display the plot
    plt.show()


def repeat_instances_dataset(dataset, n_reps):
    """Return a copy of ``dataset`` where every element of every key's
    list is repeated ``n_reps`` times consecutively, preserving order."""
    new_dataset = dict()
    for key in dataset:
        new_dataset[key] = [elem for elem in dataset[key]
                            for _ in range(n_reps)]
    return new_dataset


def study_a_model(model_name: str,
                  testing_quality: str,
                  paradigm: str,
                  training_quality: str = '',
                  tune_hyperparameters: bool = False,
                  reps: int = 10
                  ):
    """Train/evaluate one model (or heuristic) and pickle its results.

    Runs ``reps`` repetitions; for ML models each repetition re-trains the
    model, while heuristics are simply re-evaluated.  Saves a dict with
    average metrics, per-instance average timings, and the per-repetition
    value of every metric, then returns it.

    Raises ``ValueError`` if heuristic-only constraints are violated.
    """
    if model_name in heuristics:
        # Heuristics have no training phase, so these arguments make no
        # sense for them — fail fast on misuse.
        if training_quality != '':
            raise ValueError(f"training_quality cannot be {training_quality}.")
        if tune_hyperparameters is not False:
            raise ValueError(f"Hyperparams cannot be tuned for {paradigm}.")
    testing_filename = find_dataset_filename('Test', testing_quality)
    with open(testing_filename, 'rb') as testing_file:
        testing_dataset = pickle.load(testing_file)
    if testing_quality == 'Biased':
        # The biased dataset has factorial_nvar times fewer instances,
        # so we repeat each instance factorial_nvar times to keep the
        # evaluation comparable.
        factorial_nvar = len(testing_dataset['projections'][0])
        testing_dataset = \
            repeat_instances_dataset(testing_dataset, factorial_nvar)
    all_metrics = []
    all_timings = []
    for _ in range(reps):
        if model_name not in heuristics:
            # Heuristics need neither hyperparameter tuning nor training.
            hyperparams_filename = find_hyperparams_filename(
                model_name, paradigm, training_quality) + '.yaml'
            # Tune when explicitly requested or when no tuned
            # hyperparameters exist yet on disk.
            if tune_hyperparameters \
                    or not os.path.exists(hyperparams_filename):
                if not os.path.exists(hyperparams_filename):
                    print("hyperparams_filename doesn't exist \n",
                          hyperparams_filename)
                choose_hyperparams(model_name, paradigm, training_quality)
            # Hyperparameters ready
            train_model(model_name, paradigm, training_quality)
            # Model trained
        chosen_indices = choose_indices(model_name, testing_dataset,
                                        paradigm, training_quality)
        # Indices chosen by the model
        all_metrics.append(compute_metrics(chosen_indices, testing_dataset))
        all_timings.append([timings[index] for timings, index in
                            zip(testing_dataset['timings'], chosen_indices)])
    model_info = dict()
    model_info['AverageMetrics'] = \
        {key: sum(metrics[key] for metrics in all_metrics)/reps
         for key in all_metrics[0]}
    # average metrics computed for comparison purposes
    model_info['AverageTimings'] = \
        [sum(all_timings_in_instance)/reps
         for all_timings_in_instance in zip(*all_timings)]
    # average timings in each instance to create adversarial plots
    for key in all_metrics[0]:
        model_info['All' + key] = [metrics[key] for metrics in all_metrics]
    # info of all metrics saved for seaborn boxplots
    all_info_filename = find_all_info(model_name, paradigm, training_quality)
    with open(all_info_filename, 'wb') as all_info_file:
        pickle.dump(model_info, all_info_file)
    return model_info


if __name__ == "__main__":
    reps = 1
    # Set to False to reuse datasets already on disk.
    new_datasets = True
    if new_datasets:
        cleaning_dataset()
        create_train_test_datasets()
        create_regression_datasets()
    all_total_times = dict()
    for model_name in list(all_models) + heuristics:
        if model_name in heuristics:
            # Heuristics: evaluated on the biased dataset, no training.
            testing_quality = 'Biased'
            training_quality = ''
            tune_hyperparameters = False
            paradigm = 'Greedy'  # NotGreedy
        else:
            testing_quality = 'Augmented'
            training_quality = 'Augmented'
            tune_hyperparameters = False
            if model_name in classifiers:
                paradigm = ''
            elif model_name in regressors:
                paradigm = 'Regression'
            else:
                # Fail fast instead of silently reusing the previous
                # iteration's paradigm for an unclassified model.
                raise ValueError(
                    f"{model_name} is neither a classifier nor a regressor.")
        print(model_name)
        model_info = study_a_model(model_name=model_name,
                                   testing_quality=testing_quality,
                                   paradigm=paradigm,
                                   training_quality=training_quality,
                                   tune_hyperparameters=tune_hyperparameters,
                                   reps=reps
                                   )
        all_total_times[model_name] = model_info['AllTotalTime']
    dominiks_plots(all_total_times)


# def choose_indices(model, dataset):
#     if model in classifiers:
#     elif model in heuristics:
#         ordering_choices_heuristics(model, dataset)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
You can’t perform that action at this time.