diff --git a/DatasetsBeforeProcessing/dataset_without_repetition_return_ncells.txt b/DatasetsBeforeProcessing/dataset_without_repetition_return_ncells.txt deleted file mode 100644 index 7e9e028..0000000 Binary files a/DatasetsBeforeProcessing/dataset_without_repetition_return_ncells.txt and /dev/null differ diff --git a/create_clean_dataset.py b/create_clean_dataset.py index 4651870..0e73222 100644 --- a/create_clean_dataset.py +++ b/create_clean_dataset.py @@ -53,7 +53,7 @@ def cleaning_dataset(): clean_dataset['names'], clean_dataset['features'] = \ remove_notunique_features(my_dataset['names'], my_dataset['features']) - print("features in normal", len(my_dataset['features'][0])) + print("features in biased", len(my_dataset['features'][0])) unique_features_filename = find_other_filename("unique_features") with open(unique_features_filename, 'wb') as unique_features_file: pickle.dump(clean_dataset['names'], unique_features_file) diff --git a/datasets/clean_dataset.txt b/datasets/clean_dataset.txt index 5242697..823e6a4 100644 Binary files a/datasets/clean_dataset.txt and b/datasets/clean_dataset.txt differ diff --git a/find_filename.py b/find_filename.py index aa6a9ac..7b780db 100644 --- a/find_filename.py +++ b/find_filename.py @@ -19,7 +19,7 @@ def find_dataset_filename(purpose, dataset_quality=None, paradigm=''): if purpose == "unclean": return os.path.join(os.path.dirname(__file__), 'DatasetsBeforeProcessing', - 'dataset_without_repetition_return_ncells.txt') + 'dataset_without_repetition_return_ncells_with_subdir.txt') # 'dataset_with_repetition_return_ncells.txt') # for returning "repeated" instances # those with the same number of cells for all projections @@ -49,6 +49,6 @@ def find_other_filename(search): f'{search}.txt') -def find_timings_lists(model): +def find_all_info(model_name, paradigm, training_quality): return os.path.join(os.path.dirname(__file__), 'results', - 'timings_lists', f'{model}.txt') + 'all_info', f'{model_name}-{paradigm}-{training_quality}.txt') diff --git a/replicating_Dorians_features.py b/replicating_Dorians_features.py index bf1cf44..688858d 100644 --- a/replicating_Dorians_features.py +++ b/replicating_Dorians_features.py @@ -57,15 +57,17 @@ def extract_features(dataset): all_original_polynomials = [] all_projections = [] all_cells = [] - for index, projections in enumerate(dataset[0]): + all_subdirs = [] + for index, projections in enumerate(dataset['projections']): all_projections.append(projections) original_polynomials = projections[0][0] # the original polynomials are the initial polynomials of any # of the possible projections (also of the first one) all_original_polynomials.append(original_polynomials) - all_labels.append(dataset[1][index]) - all_timings.append(dataset[2][index]) - all_cells.append(dataset[4][index]) + all_labels.append(dataset['targets'][index]) + all_timings.append(dataset['timings'][index]) + all_cells.append(dataset['ncells'][index]) + all_subdirs.append(dataset['subdirs'][index]) names, instance_features = \ features_from_set_of_polys(original_polynomials) all_features.append(instance_features) @@ -76,7 +78,10 @@ def extract_features(dataset): my_dataset['timings'] = np.array(all_timings) my_dataset['projections'] = all_projections my_dataset['cells'] = np.array(all_cells) + my_dataset['subdir'] = np.array(all_subdirs) # all these use to be converted to np.array() + # Modify this so that smaller changes are done to my_dataset, + # because it is almost the same as dataset return my_dataset diff --git a/run_for_paper.py b/run_for_paper.py index 6af4d39..49fc59f 100644 --- a/run_for_paper.py +++ b/run_for_paper.py @@ -15,30 +15,31 @@ from train_models import train_model from main_heuristics import ordering_choices_heuristics from find_filename import find_dataset_filename -from find_filename import find_timings_lists +# from find_filename import find_timings_lists from find_filename import find_hyperparams_filename +from find_filename import find_all_info from test_models import compute_metrics from test_models import choose_indices -def metrics_for_all_reps(all_indices_chosen, testing_dataset, ml_model): - all_metrics = [compute_metrics(chosen_indices, testing_dataset) - for chosen_indices in all_indices_chosen] - aveg_metrics = {key: sum(metrics[key]/len(all_metrics) - for metrics in all_metrics) - for key in all_metrics[0]} - all_timings = testing_dataset['timings'] - aveg_timings = [] - for instance in range(len(all_indices_chosen[0])): - instance_timings = [timings[indices_chosen[instance]] - for timings, indices_chosen - in zip(all_timings, all_indices_chosen)] - aveg_timings.append(instance_timings) - timings_lists_filename = find_timings_lists(ml_model) - with open(timings_lists_filename, 'wb') as timings_lists_file: - pickle.dump(aveg_timings, timings_lists_file) - all_total_times = [metrics['TotalTime'] for metrics in all_metrics] - return aveg_metrics, all_total_times +# def metrics_for_all_reps(all_indices_chosen, testing_dataset, ml_model): +# all_metrics = [compute_metrics(chosen_indices, testing_dataset) +# for chosen_indices in all_indices_chosen] +# aveg_metrics = {key: sum(metrics[key]/len(all_metrics) +# for metrics in all_metrics) +# for key in all_metrics[0]} +# all_timings = testing_dataset['timings'] +# aveg_timings = [] +# for instance in range(len(all_indices_chosen[0])): +# instance_timings = [timings[indices_chosen[instance]] +# for timings, indices_chosen +# in zip(all_timings, all_indices_chosen)] +# aveg_timings.append(instance_timings) +# timings_lists_filename = find_timings_lists(ml_model) +# with open(timings_lists_filename, 'wb') as timings_lists_file: +# pickle.dump(aveg_timings, timings_lists_file) +# all_total_times = [metrics['TotalTime'] for metrics in all_metrics] +# return aveg_metrics, all_total_times def dominiks_plots(all_total_times): @@ -84,10 +85,10 @@ def study_a_model(model_name: str, testing_filename = find_dataset_filename('Test', testing_quality) with open(testing_filename, 'rb') as testing_file: testing_dataset = pickle.load(testing_file) - factorial_nvar = len(testing_dataset['projections'][0]) - if testing_quality in ['Biased', 'Balanced']: + if testing_quality == 'Biased': # If the dataset contains less factorial_nvar less instances, # we repeat each instance factorial_nvar times + factorial_nvar = len(testing_dataset['projections'][0]) testing_dataset = \ repeat_instances_dataset(testing_dataset, factorial_nvar) all_metrics = [] @@ -126,14 +127,17 @@ def study_a_model(model_name: str, model_info['All' + key] = [metrics[key] for metrics in all_metrics] # info of all metrics saved for seaborn boxplots + all_info_filename = find_all_info(model_name, paradigm, training_quality) + with open(all_info_filename, 'wb') as all_info_file: + pickle.dump(model_info, all_info_file) return model_info if __name__ == "__main__": - reps = 50 + reps = 1 data = dict() data['TotalTime'] = [] - new_datasets = False + new_datasets = True if new_datasets: cleaning_dataset() create_train_test_datasets() @@ -153,7 +157,7 @@ def study_a_model(model_name: str, paradigm = '' elif model_name in regressors: paradigm = 'Regression' - + print(model_name) model_info = study_a_model(model_name=model_name, testing_quality=testing_quality, paradigm=paradigm, diff --git a/test_train_datasets.py b/test_train_datasets.py index cdc7c3c..6a666ec 100644 --- a/test_train_datasets.py +++ b/test_train_datasets.py @@ -17,6 +17,7 @@ from find_filename import find_dataset_filename from find_filename import find_other_filename from math import log +from sklearn.model_selection import GroupShuffleSplit def count_instances(my_dataset, instance): @@ -40,23 +41,32 @@ def create_train_test_datasets(): # train and test sets are created random_state = 0 print(dataset.keys()) - datasets['Train_Biased']['features'], \ - datasets['Test_Biased']['features'], \ - datasets['Train_Biased']['labels'], \ - datasets['Test_Biased']['labels'], \ - datasets['Train_Biased']['timings'], \ - datasets['Test_Biased']['timings'], \ - datasets['Train_Biased']['projections'], \ - datasets['Test_Biased']['projections'], \ - datasets['Train_Biased']['cells'], \ - datasets['Test_Biased']['cells'] = \ - train_test_split(dataset['features'], - dataset['labels'], - dataset['timings'], - dataset['projections'], - dataset['cells'], - test_size=0.20, - random_state=random_state) + train_inds, test_inds = my_train_test_split(dataset['subdir']) + for key in dataset: + if key != 'names': + datasets['Train_Biased'][key] = \ + [dataset[key][i] for i in train_inds] + datasets['Test_Biased'][key] = \ + [dataset[key][i] for i in test_inds] + + # datasets['Train_Biased']['features'], \ + # datasets['Test_Biased']['features'], \ + # datasets['Train_Biased']['labels'], \ + # datasets['Test_Biased']['labels'], \ + # datasets['Train_Biased']['timings'], \ + # datasets['Test_Biased']['timings'], \ + # datasets['Train_Biased']['projections'], \ + # datasets['Test_Biased']['projections'], \ + # datasets['Train_Biased']['cells'], \ + # datasets['Test_Biased']['cells'] = \ + # train_test_split( + # dataset['features'], + # dataset['labels'], + # dataset['timings'], + # dataset['projections'], + # dataset['cells'], + # test_size=0.20, + # random_state=random_state) keys = ['features', 'timings', 'cells'] for purpose in purposes: datasets[f'{purpose}_Balanced'] = \ @@ -84,9 +94,8 @@ def create_train_test_datasets(): pickle.dump(datasets[purpose + '_' + quality], this_dataset_file) - - ## The following code is to count how many instances of each are there in the different datasets - ## Sould be substitute by another function + # The following code is to count how many instances of each are there in the different datasets + # Sould be substitute by another function # {datasets[f'{purpose}_balanced'][key]: elem for elem in balance_dataset(datasets[f'{purpose}_balanced'][key2] for key2 in keys) for key in keys} # x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal']) @@ -137,6 +146,13 @@ def create_regression_datasets(dataset_quality='Augmented', # in regression_dataset['timings']] +def my_train_test_split(groups): + splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state=7) + split = splitter.split(groups, groups=groups) + train_inds, test_inds = next(split) + return train_inds, test_inds + + # create_regression_datasets(taking_logarithms=False) -# create_train_test_datasets() \ No newline at end of file +# create_train_test_datasets() diff --git a/train_models.py b/train_models.py index 8d6df33..0ffb47b 100644 --- a/train_models.py +++ b/train_models.py @@ -17,7 +17,6 @@ def train_model(model_name, paradigm, training_quality): train_data_filename = find_dataset_filename('Train', dataset_quality=training_quality, paradigm=paradigm) - print(model_name, 'dataset used for train', train_data_filename) hyperparams_file = find_hyperparams_filename(model_name, paradigm=paradigm, training_quality=training_quality) with open(train_data_filename, 'rb') as train_data_file: train_dataset = pickle.load(train_data_file)