diff --git a/create_clean_dataset.py b/create_clean_dataset.py index d9786a1..4d60cc6 100644 --- a/create_clean_dataset.py +++ b/create_clean_dataset.py @@ -12,6 +12,7 @@ from packages.dataset_manipulation import remove_notunique_features from from_poly_set_to_features import poly_set_feature_extractor from find_filename import find_dataset_filename +from find_filename import find_other_filename def create_dataframe(dataset): @@ -44,21 +45,26 @@ def cleaning_dataset(): clean_dataset_filename = find_dataset_filename('clean') with open(dataset_filename, 'rb') as f: dataset = pickle.load(f) - original_polys_list, names, features_list, targets_list, timings_list =\ - extract_features(dataset) - # working with raw features - features = np.array(features_list) - unique_names, unique_features = remove_notunique_features(names, features) - - targets = np.array(targets_list) - timings = np.array([[convert_to_timing(timings_ordering) - for timings_ordering in timings_problem] - for timings_problem in timings_list]) - original_polys = np.array(original_polys_list) + my_dataset = extract_features(dataset) + clean_dataset = dict() + # # working with raw features + # features = np.array(features_list) + clean_dataset['names'], clean_dataset['features'] = \ + remove_notunique_features(my_dataset['names'], + my_dataset['features']) + unique_features_filename = find_other_filename("unique_features") + with open(unique_features_filename, 'wb') as unique_features_file: + pickle.dump(clean_dataset['names'], unique_features_file) + # Some timings are expressed as "Over 30", this is changed here + clean_dataset['timings'] = \ + np.array([[convert_to_timing(timings_ordering) + for timings_ordering in timings_problem] + for timings_problem in my_dataset['timings']]) + for key in my_dataset: + if key not in clean_dataset: + clean_dataset[key] = my_dataset[key] with open(clean_dataset_filename, 'wb') as clean_dataset_file: - dataset = pickle.dump((original_polys, unique_names, - unique_features, targets, timings), - clean_dataset_file) + pickle.dump(clean_dataset, clean_dataset_file) # dataset_filename = os.path.join(os.path.dirname(__file__), # 'DatasetsBeforeProcessing', diff --git a/datasets/clean_dataset.txt b/datasets/clean_dataset.txt index 353a1c0..24c1b18 100644 Binary files a/datasets/clean_dataset.txt and b/datasets/clean_dataset.txt differ diff --git a/datasets/dataset_instances.csv b/datasets/dataset_instances.csv index 35fc40f..036e7e4 100644 --- a/datasets/dataset_instances.csv +++ b/datasets/dataset_instances.csv @@ -1,7 +1,7 @@ dataset,zero,one,two,three,four,five,total train normal dataset,326,74,105,41,163,106,815 -train balanced dataset,125,140,134,121,136,159,815 +train balanced dataset,137,123,145,150,122,138,815 train augmented dataset,815,815,815,815,815,815,4890 test normal dataset,80,19,30,10,39,26,204 -test balanced dataset,30,38,36,36,30,34,204 +test balanced dataset,30,30,41,35,36,32,204 test augmented dataset,204,204,204,204,204,204,1224 diff --git a/datasets/test/augmented_test_dataset.txt b/datasets/test/augmented_test_dataset.txt index 282d8a5..e1ab3ef 100644 Binary files a/datasets/test/augmented_test_dataset.txt and b/datasets/test/augmented_test_dataset.txt differ diff --git a/datasets/test/balanced_test_dataset.txt b/datasets/test/balanced_test_dataset.txt index ecf68b1..1712401 100644 Binary files a/datasets/test/balanced_test_dataset.txt and b/datasets/test/balanced_test_dataset.txt differ diff --git a/datasets/test/normal_test_dataset.txt b/datasets/test/normal_test_dataset.txt index 12b7375..72b027d 100644 Binary files a/datasets/test/normal_test_dataset.txt and b/datasets/test/normal_test_dataset.txt differ diff --git a/datasets/train/augmented_train_dataset.txt b/datasets/train/augmented_train_dataset.txt index 0211cb0..3ed1c22 100644 Binary files a/datasets/train/augmented_train_dataset.txt and b/datasets/train/augmented_train_dataset.txt differ diff --git a/datasets/train/balanced_train_dataset.txt b/datasets/train/balanced_train_dataset.txt index 1ae54c9..0b42a35 100644 Binary files a/datasets/train/balanced_train_dataset.txt and b/datasets/train/balanced_train_dataset.txt differ diff --git a/datasets/train/normal_train_dataset.txt b/datasets/train/normal_train_dataset.txt index d6c5dcc..21b4a44 100644 Binary files a/datasets/train/normal_train_dataset.txt and b/datasets/train/normal_train_dataset.txt differ diff --git a/main.py b/main.py index f861665..fb90487 100644 --- a/main.py +++ b/main.py @@ -12,6 +12,7 @@ Mathematical Software, ICMS 2020. ICMS 2020. Lecture Notes in Computer Science, vol 12097. Springer, Cham. https://doi.org/10.1007/978-3-030-52200-1_30 """ +import csv from config.ml_models import ml_models from config.ml_models import dataset_types from find_filename import find_dataset_filename @@ -29,7 +30,7 @@ # if tune_hyperparameters is used to decide whether to tune them # or to used previously tuned # tune_hyperparameters = False - +paradigm = 'classification' # cleaning_dataset() # create_train_test_datasets() @@ -44,31 +45,44 @@ # for method in dataset_types: # print(f"for {method}") # train_model(ml_model, method) -# for training_method in dataset_types: -# print(f"Testing models trained in {training_method}") -# test_results(training_method) - -timings = dict() +training_method = 'augmented' testing_method = 'augmented' -test_dataset_filename = find_dataset_filename('test', - testing_method) - -with open("classification_output_timings.csv", 'w') as f: - f.write("model, Normal, Balanced, Augmented\n") +first_time = 1 +output_file = "classification_output_acc_time.csv" for ml_model in ml_models: - for training_method in dataset_types: - trained_model_filename = find_model_filename(training_method, - ml_model) - accuracy = test_model(trained_model_filename, - test_dataset_filename) - timings[training_method] = timings_in_test(ml_model, testing_method, - training_method) - total_time = sum(timings[training_method]) - # with open("classification_output_acc_time.csv", 'a') as f: - # f.write(f"{ml_model}, {accuracy}, {total_time}\n") - with open("classification_output_timings.csv", 'a') as f: - f.write(f"{ml_model}, {sum(timings['Normal'])}, {sum(timings['Balanced'])}, {sum(timings['Augmented'])}\n") - timings['optimal'] = timings_in_test('optimal', testing_method) - print(sum(timings['optimal'])) - from make_plots import survival_plot - survival_plot(timings, plot_name=f"survival_plot_{ml_model}") + print(f"Testing models trained in {training_method}") + metrics = test_model(ml_model, paradigm=training_method, testing_method=testing_method) + if first_time == 1: + first_time = 0 + keys = list(metrics.keys()) + with open(output_file, 'a') as f: + f.write(', '.join(['Model'] + keys) + '\n') + with open(output_file, 'a', newline='') as f: + writer = csv.writer(f) + writer.writerow([ml_model] + [metrics[key] for key in keys]) + + +# timings = dict() +# testing_method = 'augmented' +# test_dataset_filename = find_dataset_filename('test', +# testing_method) + +# with open("classification_output_timings.csv", 'w') as f: +# f.write("model, Normal, Balanced, Augmented\n") +# for ml_model in ml_models: +# for training_method in dataset_types: +# trained_model_filename = find_model_filename(training_method, +# ml_model) +# accuracy = test_model(trained_model_filename, +# test_dataset_filename) +# timings[training_method] = timings_in_test(ml_model, testing_method, +# training_method) +# total_time = sum(timings[training_method]) +# # with open("classification_output_acc_time.csv", 'a') as f: +# # f.write(f"{ml_model}, {accuracy}, {total_time}\n") +# with open("classification_output_timings.csv", 'a') as f: +# f.write(f"{ml_model}, {sum(timings['Normal'])}, {sum(timings['Balanced'])}, {sum(timings['Augmented'])}\n") +# timings['optimal'] = timings_in_test('optimal', testing_method) +# print(sum(timings['optimal'])) +# from make_plots import survival_plot +# survival_plot(timings, plot_name=f"survival_plot_{ml_model}") diff --git a/replicating_Dorians_features.py b/replicating_Dorians_features.py index 945fd3b..c27c89c 100644 --- a/replicating_Dorians_features.py +++ b/replicating_Dorians_features.py @@ -1,6 +1,10 @@ -"""IS THIS BEING USED?""" +""" +IS THIS BEING USED? +YES, IT IS! +""" + import itertools -from xml.sax.handler import all_features +# from xml.sax.handler import all_features import numpy as np @@ -9,7 +13,8 @@ def aveg(given_list): def aveg_not_zero(given_list): - return sum(given_list)/max(1,len([1 for elem in given_list if elem!=0])) + return sum(given_list)/max(1, len([1 for elem in given_list + if elem != 0])) def identity(input): @@ -30,34 +35,57 @@ def sign(input): raise Exception("How is this possible?") -def create_features(degrees, variable=0, sv=False): - functions = [sum, max, aveg, aveg_not_zero] +def create_features(degrees, variable=0, sv=False, + include_aveg_not_zero=False): + if include_aveg_not_zero: + functions = [sum, max, aveg, aveg_not_zero] + else: + functions = [sum, max, aveg] # , aveg_not_zero] sign_or_not = [identity, sign] features = [] features_names = [] - for choice in itertools.product(functions, sign_or_not, functions, sign_or_not): - feature_description = choice[0].__name__+"sign"*(choice[1].__name__=="sign")+"_in_polys_"+choice[2].__name__+"_"+"sign"*(choice[3].__name__=="sign")+"of_" + "sum_of_"*sv+"degrees_of_var_"+str(variable)+"_in_monomials" - feature_value = choice[0](choice[1]([choice[2](choice[3](degrees_in_poly)) for degrees_in_poly in degrees])) + for choice in itertools.product(functions, + sign_or_not, functions, + sign_or_not): + feature_description = (choice[0].__name__ + + "sign" * (choice[1].__name__ == "sign") + + "_in_polys_" + choice[2].__name__ + "_" + + "sign" * (choice[3].__name__ == "sign") + + "of_" + "sum_of_" * sv + "degrees_of_var_" + + str(variable) + "_in_monomials") + feature_value = \ + choice[0](choice[1]([choice[2](choice[3](degrees_in_poly)) + for degrees_in_poly in degrees])) features.append(feature_value) features_names.append(feature_description) return features, features_names def extract_features(dataset): + my_dataset = dict() all_features = [] all_targets = [] all_timings = [] all_original_polynomials = [] - for index, all_projections in enumerate(dataset[0]): - original_polynomials = all_projections[0][0] + all_projections = [] + for index, projections in enumerate(dataset[0]): + all_projections.append(projections) + original_polynomials = projections[0][0] # the original polynomials are the initial polynomials of any # of the possible projections (also of the first one) all_original_polynomials.append(original_polynomials) all_targets.append(dataset[1][index]) all_timings.append(dataset[2][index]) - names, instance_features = features_from_set_of_polys(original_polynomials) + names, instance_features = features_from_set_of_polys( + original_polynomials) all_features.append(instance_features) - return np.array(all_original_polynomials), np.array(names), np.array(all_features), np.array(all_targets), np.array(all_timings) + my_dataset['polynomials'] = np.array(all_original_polynomials) + my_dataset['names'] = np.array(names) + my_dataset['features'] = np.array(all_features) + my_dataset['targets'] = np.array(all_targets) + my_dataset['timings'] = np.array(all_timings) + my_dataset['projections'] = np.array(all_projections) + return my_dataset def features_from_set_of_polys(original_polynomials): @@ -71,8 +99,12 @@ def features_from_set_of_polys(original_polynomials): variable=var) instance_features += var_features names += var_features_names - sdegrees = [[sum(monomial) for monomial in poly if monomial[var]!=0]+[0] for poly in original_polynomials] - svar_features, svar_features_names = create_features(sdegrees, variable=var, sv=True) + sdegrees = \ + [[sum(monomial) for monomial in poly if monomial[var] != 0] + [0] + for poly in original_polynomials] + svar_features, svar_features_names = create_features(sdegrees, + variable=var, + sv=True) instance_features += svar_features names += svar_features_names return names, instance_features diff --git a/test_train_datasets.py b/test_train_datasets.py index 20e2df2..ed1b73c 100644 --- a/test_train_datasets.py +++ b/test_train_datasets.py @@ -23,23 +23,29 @@ def count_instances(my_dataset, instance): def create_train_test_datasets(): clean_dataset_filename = find_dataset_filename('clean') with open(clean_dataset_filename, 'rb') as clean_dataset_file: - _, names, features, targets, timings = pickle.load(clean_dataset_file) - unique_names, unique_features = remove_notunique_features(names, features) - # features were already unique because of create_clean_dataset - # decide where to remove the features - print("create_train_test", timings) - unique_features_filename = find_other_filename("unique_features") - with open(unique_features_filename, 'wb') as unique_features_file: - pickle.dump(unique_features_filename, unique_features_file) + dataset = pickle.load(clean_dataset_file) + + ### + # Instead of creating dictionaries for features, labels,...abs + # maybe it's better to create a dictionary for each dataset: + # train/test, normal/balanced/augmented + ### x = dict() # to keep the features y = dict() # to keep the labels t = dict() # to keep the timings + p = dict() # to keep the projections # train and test sets are created random_state = 0 - x['train_normal'], x['test_normal'], y['train_normal'], y['test_normal'], t['train_normal'], t['test_normal'] = train_test_split(unique_features, targets, timings, - test_size=0.20, - random_state=random_state) - + x['train_normal'], x['test_normal'], \ + y['train_normal'], y['test_normal'], \ + t['train_normal'], t['test_normal'], \ + p['train_normal'], p['test_normal'] = \ + train_test_split(dataset['features'], + dataset['targets'], + dataset['timings'], + dataset['projections'], + test_size=0.20, + random_state=random_state) for purpose in ['train', 'test']: x[f'{purpose}_balanced'], y[f'{purpose}_balanced'], t[f'{purpose}_balanced'] = balance_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal']) x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal']) @@ -51,7 +57,10 @@ def create_train_test_datasets(): for method in ['normal', 'balanced', 'augmented']: this_dataset_filename = find_dataset_filename(usage, method=method) with open(this_dataset_filename, 'wb') as this_dataset_file: - pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}'], t[f'{usage}_{method}']), this_dataset_file) + if method == 'normal': + pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}'], t[f'{usage}_{method}'], p[f'{usage}_{method}']), this_dataset_file) + else: + pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}'], t[f'{usage}_{method}']), this_dataset_file) writer.writerow([f'{usage} {method} dataset'] + [str(count_instances(y[f'{usage}_{method}'], i)) @@ -59,14 +68,17 @@ def create_train_test_datasets(): + [str(len(y[f'{usage}_{method}']))]) -def create_regression_datasets(): +def create_regression_datasets(taking_logarithms=True): for usage in ['train', 'test']: this_dataset_filename = find_dataset_filename(usage, method='augmented') # we will use the augmented dataset here with open(this_dataset_filename, 'rb') as this_dataset_file: X, Y, T = pickle.load(this_dataset_file) - Y = [log(timings[0]) for timings in T] # remove log here if real times want to be given + if taking_logarithms: + Y = [log(timings[0]) for timings in T] + else: + Y = [timings[0] for timings in T] this_dataset_filename =\ find_dataset_filename(usage, method='regression') with open(this_dataset_filename, 'wb') as this_dataset_file: diff --git a/train_models.py b/train_models.py index ad32ca6..84a2921 100644 --- a/train_models.py +++ b/train_models.py @@ -14,7 +14,12 @@ def train_model(ml_model, method): train_data_filename = find_dataset_filename('train', method=method) hyperparams_file = find_hyperparams_filename(method, ml_model) with open(train_data_filename, 'rb') as train_data_file: - x_train, y_train, _ = pickle.load(train_data_file) + if method == "Normal": + x_train, y_train, _, _ = pickle.load(train_data_file) + else: + x_train, y_train, _ = pickle.load(train_data_file) + # a = pickle.load(train_data_file) + # print(a[0], type(a), len(a), method) hyperparams = read_yaml_from_file(hyperparams_file) current_classifier = sklearn_models[ml_model] clf = current_classifier(**hyperparams)