Skip to content

Commit

Permalink
Test-train split done without messing with the folders
Browse files Browse the repository at this point in the history
Tereso del Rio committed Sep 23, 2023
1 parent cc21e87 commit c33c4a6
Showing 8 changed files with 78 additions and 54 deletions.
Binary file not shown.
2 changes: 1 addition & 1 deletion create_clean_dataset.py
Original file line number Diff line number Diff line change
@@ -53,7 +53,7 @@ def cleaning_dataset():
clean_dataset['names'], clean_dataset['features'] = \
remove_notunique_features(my_dataset['names'],
my_dataset['features'])
print("features in normal", len(my_dataset['features'][0]))
print("features in biased", len(my_dataset['features'][0]))
unique_features_filename = find_other_filename("unique_features")
with open(unique_features_filename, 'wb') as unique_features_file:
pickle.dump(clean_dataset['names'], unique_features_file)
Binary file modified datasets/clean_dataset.txt
Binary file not shown.
6 changes: 3 additions & 3 deletions find_filename.py
Original file line number Diff line number Diff line change
@@ -19,7 +19,7 @@ def find_dataset_filename(purpose, dataset_quality=None, paradigm=''):
if purpose == "unclean":
return os.path.join(os.path.dirname(__file__),
'DatasetsBeforeProcessing',
'dataset_without_repetition_return_ncells.txt')
'dataset_without_repetition_return_ncells_with_subdir.txt')
# 'dataset_with_repetition_return_ncells.txt')
# for returning "repeated" instances
# those with the same number of cells for all projections
@@ -49,6 +49,6 @@ def find_other_filename(search):
f'{search}.txt')


def find_timings_lists(model):
def find_all_info(model_name, paradigm, training_quality):
return os.path.join(os.path.dirname(__file__), 'results',
'timings_lists', f'{model}.txt')
'all_info', f'{model_name}-{paradigm}-{training_quality}.txt')
13 changes: 9 additions & 4 deletions replicating_Dorians_features.py
Original file line number Diff line number Diff line change
@@ -57,15 +57,17 @@ def extract_features(dataset):
all_original_polynomials = []
all_projections = []
all_cells = []
for index, projections in enumerate(dataset[0]):
all_subdirs = []
for index, projections in enumerate(dataset['projections']):
all_projections.append(projections)
original_polynomials = projections[0][0]
# the original polynomials are the initial polynomials of any
# of the possible projections (also of the first one)
all_original_polynomials.append(original_polynomials)
all_labels.append(dataset[1][index])
all_timings.append(dataset[2][index])
all_cells.append(dataset[4][index])
all_labels.append(dataset['targets'][index])
all_timings.append(dataset['timings'][index])
all_cells.append(dataset['ncells'][index])
all_subdirs.append(dataset['subdirs'][index])
names, instance_features = \
features_from_set_of_polys(original_polynomials)
all_features.append(instance_features)
@@ -76,7 +78,10 @@ def extract_features(dataset):
my_dataset['timings'] = np.array(all_timings)
my_dataset['projections'] = all_projections
my_dataset['cells'] = np.array(all_cells)
my_dataset['subdir'] = np.array(all_subdirs)
# all of these used to be converted to np.array()
# TODO: modify this so that smaller changes are made to my_dataset,
# because it is almost the same as dataset
return my_dataset


52 changes: 28 additions & 24 deletions run_for_paper.py
Original file line number Diff line number Diff line change
@@ -15,30 +15,31 @@
from train_models import train_model
from main_heuristics import ordering_choices_heuristics
from find_filename import find_dataset_filename
from find_filename import find_timings_lists
# from find_filename import find_timings_lists
from find_filename import find_hyperparams_filename
from find_filename import find_all_info
from test_models import compute_metrics
from test_models import choose_indices


def metrics_for_all_reps(all_indices_chosen, testing_dataset, ml_model):
    """Average the metrics obtained over several repetitions.

    One metrics dict is computed per repetition (one list of chosen
    indices per repetition), every metric is averaged across the
    repetitions, and the per-instance timing lists are pickled to the
    file named by ``find_timings_lists(ml_model)``.

    Returns a pair ``(aveg_metrics, all_total_times)`` where
    ``all_total_times`` collects the 'TotalTime' metric of every
    repetition.
    """
    all_metrics = []
    for rep_indices in all_indices_chosen:
        all_metrics.append(compute_metrics(rep_indices, testing_dataset))
    n_reps = len(all_metrics)
    # Mean of each metric across all repetitions.
    aveg_metrics = {}
    for key in all_metrics[0]:
        aveg_metrics[key] = sum(m[key] / n_reps for m in all_metrics)
    all_timings = testing_dataset['timings']
    aveg_timings = []
    # NOTE(review): the zip pairs the i-th instance's timing list with the
    # i-th repetition's chosen indices — confirm this pairing is intended.
    for instance in range(len(all_indices_chosen[0])):
        per_rep_timings = [timings[rep_indices[instance]]
                           for timings, rep_indices
                           in zip(all_timings, all_indices_chosen)]
        aveg_timings.append(per_rep_timings)
    with open(find_timings_lists(ml_model), 'wb') as timings_file:
        pickle.dump(aveg_timings, timings_file)
    total_times = [m['TotalTime'] for m in all_metrics]
    return aveg_metrics, total_times
# def metrics_for_all_reps(all_indices_chosen, testing_dataset, ml_model):
# all_metrics = [compute_metrics(chosen_indices, testing_dataset)
# for chosen_indices in all_indices_chosen]
# aveg_metrics = {key: sum(metrics[key]/len(all_metrics)
# for metrics in all_metrics)
# for key in all_metrics[0]}
# all_timings = testing_dataset['timings']
# aveg_timings = []
# for instance in range(len(all_indices_chosen[0])):
# instance_timings = [timings[indices_chosen[instance]]
# for timings, indices_chosen
# in zip(all_timings, all_indices_chosen)]
# aveg_timings.append(instance_timings)
# timings_lists_filename = find_timings_lists(ml_model)
# with open(timings_lists_filename, 'wb') as timings_lists_file:
# pickle.dump(aveg_timings, timings_lists_file)
# all_total_times = [metrics['TotalTime'] for metrics in all_metrics]
# return aveg_metrics, all_total_times


def dominiks_plots(all_total_times):
@@ -84,10 +85,10 @@ def study_a_model(model_name: str,
testing_filename = find_dataset_filename('Test', testing_quality)
with open(testing_filename, 'rb') as testing_file:
testing_dataset = pickle.load(testing_file)
factorial_nvar = len(testing_dataset['projections'][0])
if testing_quality in ['Biased', 'Balanced']:
if testing_quality == 'Biased':
# If the dataset contains factorial_nvar times fewer instances,
# we repeat each instance factorial_nvar times
factorial_nvar = len(testing_dataset['projections'][0])
testing_dataset = \
repeat_instances_dataset(testing_dataset, factorial_nvar)
all_metrics = []
@@ -126,14 +127,17 @@ def study_a_model(model_name: str,
model_info['All' + key] = [metrics[key]
for metrics in all_metrics]
# info of all metrics saved for seaborn boxplots
all_info_filename = find_all_info(model_name, paradigm, training_quality)
with open(all_info_filename, 'wb') as all_info_file:
pickle.dump(model_info, all_info_file)
return model_info


if __name__ == "__main__":
reps = 50
reps = 1
data = dict()
data['TotalTime'] = []
new_datasets = False
new_datasets = True
if new_datasets:
cleaning_dataset()
create_train_test_datasets()
@@ -153,7 +157,7 @@ def study_a_model(model_name: str,
paradigm = ''
elif model_name in regressors:
paradigm = 'Regression'

print(model_name)
model_info = study_a_model(model_name=model_name,
testing_quality=testing_quality,
paradigm=paradigm,
58 changes: 37 additions & 21 deletions test_train_datasets.py
Original file line number Diff line number Diff line change
@@ -17,6 +17,7 @@
from find_filename import find_dataset_filename
from find_filename import find_other_filename
from math import log
from sklearn.model_selection import GroupShuffleSplit


def count_instances(my_dataset, instance):
@@ -40,23 +41,32 @@ def create_train_test_datasets():
# train and test sets are created
random_state = 0
print(dataset.keys())
datasets['Train_Biased']['features'], \
datasets['Test_Biased']['features'], \
datasets['Train_Biased']['labels'], \
datasets['Test_Biased']['labels'], \
datasets['Train_Biased']['timings'], \
datasets['Test_Biased']['timings'], \
datasets['Train_Biased']['projections'], \
datasets['Test_Biased']['projections'], \
datasets['Train_Biased']['cells'], \
datasets['Test_Biased']['cells'] = \
train_test_split(dataset['features'],
dataset['labels'],
dataset['timings'],
dataset['projections'],
dataset['cells'],
test_size=0.20,
random_state=random_state)
train_inds, test_inds = my_train_test_split(dataset['subdir'])
for key in dataset:
if key != 'names':
datasets['Train_Biased'][key] = \
[dataset[key][i] for i in train_inds]
datasets['Test_Biased'][key] = \
[dataset[key][i] for i in test_inds]

# datasets['Train_Biased']['features'], \
# datasets['Test_Biased']['features'], \
# datasets['Train_Biased']['labels'], \
# datasets['Test_Biased']['labels'], \
# datasets['Train_Biased']['timings'], \
# datasets['Test_Biased']['timings'], \
# datasets['Train_Biased']['projections'], \
# datasets['Test_Biased']['projections'], \
# datasets['Train_Biased']['cells'], \
# datasets['Test_Biased']['cells'] = \
# train_test_split(
# dataset['features'],
# dataset['labels'],
# dataset['timings'],
# dataset['projections'],
# dataset['cells'],
# test_size=0.20,
# random_state=random_state)
keys = ['features', 'timings', 'cells']
for purpose in purposes:
datasets[f'{purpose}_Balanced'] = \
@@ -84,9 +94,8 @@ def create_train_test_datasets():
pickle.dump(datasets[purpose + '_' + quality],
this_dataset_file)


## The following code is to count how many instances of each are there in the different datasets
## Sould be substitute by another function
# The following code counts how many instances of each kind there are in the different datasets
# Should be substituted by another function

# {datasets[f'{purpose}_balanced'][key]: elem for elem in balance_dataset(datasets[f'{purpose}_balanced'][key2] for key2 in keys) for key in keys}
# x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
@@ -137,6 +146,13 @@ def create_regression_datasets(dataset_quality='Augmented',
# in regression_dataset['timings']]


def my_train_test_split(groups):
    """Split instance indices into train/test sets without breaking groups.

    Instances sharing the same group label (here, the subdirectory an
    instance came from) are never split between train and test, so a
    subdirectory cannot leak information by appearing in both sets.

    Parameters
    ----------
    groups : array-like
        One group label per instance.

    Returns
    -------
    (train_inds, test_inds)
        Index arrays for the training and testing instances.
    """
    # n_splits=1: only the first split is consumed below, so generating a
    # second split (as n_splits=2 previously did) is wasted work; for a
    # fixed random_state the first split itself is unchanged.
    splitter = GroupShuffleSplit(test_size=.20, n_splits=1, random_state=7)
    train_inds, test_inds = next(splitter.split(groups, groups=groups))
    return train_inds, test_inds


# create_regression_datasets(taking_logarithms=False)

# create_train_test_datasets()
# create_train_test_datasets()
1 change: 0 additions & 1 deletion train_models.py
Original file line number Diff line number Diff line change
@@ -17,7 +17,6 @@

def train_model(model_name, paradigm, training_quality):
train_data_filename = find_dataset_filename('Train', dataset_quality=training_quality, paradigm=paradigm)
print(model_name, 'dataset used for train', train_data_filename)
hyperparams_file = find_hyperparams_filename(model_name, paradigm=paradigm, training_quality=training_quality)
with open(train_data_filename, 'rb') as train_data_file:
train_dataset = pickle.load(train_data_file)

0 comments on commit c33c4a6

Please sign in to comment.