delriot / AugmentingMathematicalDataset
create_clean_dataset.py
112 lines (95 loc) · 4.58 KB
"""This file contains a function that given the raw dataset containing the sets of polynomials and its timings for each order, creates a dataset containing a set of unique features and its class""" import re import pickle import numpy as np from replicating_Dorians_features import extract_features import importlib if isinstance(importlib.util.find_spec('dataset_manipulation'), type(None)): from dataset_manipulation import remove_notunique_features else: from packages.dataset_manipulation import remove_notunique_features from from_poly_set_to_features import poly_set_feature_extractor from find_filename import find_dataset_filename from find_filename import find_other_filename def create_dataframe(dataset): all_features = [] all_labels = dataset[1][:] all_timings = dataset[2][:] all_original_polynomials = [] for index, all_projections in enumerate(dataset[0]): original_polynomials = all_projections[0][0] all_original_polynomials.append(original_polynomials) names, all_features =\ poly_set_feature_extractor(all_original_polynomials, determine_standarization=True, determine_unique_features=True) return np.array(all_original_polynomials), np.array(names),\ np.array(all_features), np.array(all_labels), np.array(all_timings) # dataset_filename = os.path.join(os.path.dirname(__file__), # 'DatasetsBeforeProcessing', # 'dataset_without_repetition_return_ncells.txt') # with open(dataset_filename, 'rb') as f: # dataset = pickle.load(f) # original_polys_list, names, features_list, labels_list, timings_list =\ # create_dataframe(dataset) def cleaning_dataset(): dataset_filename = find_dataset_filename('unclean') clean_dataset_filename = find_dataset_filename('clean') with open(dataset_filename, 'rb') as f: dataset = pickle.load(f) my_dataset = extract_features(dataset) clean_dataset = dict() # # working with raw features # features = np.array(features_list) clean_dataset['names'], clean_dataset['features'] = \ remove_notunique_features(my_dataset['names'], my_dataset['features']) print("features in normal", len(my_dataset['features'][0])) unique_features_filename = find_other_filename("unique_features") with open(unique_features_filename, 'wb') as unique_features_file: pickle.dump(clean_dataset['names'], unique_features_file) # Some timings are expressed as "Over 30", this is changed here clean_dataset['timings'] = \ np.array([[convert_to_timing(timings_ordering) for timings_ordering in timings_problem] for timings_problem in my_dataset['timings']]) # Some cells are expressed as "Over 30", this is changed here clean_dataset['cells'] = \ np.array([convert_to_cells(cells_problem) for cells_problem in my_dataset['cells']]) for key in my_dataset: if key not in clean_dataset: clean_dataset[key] = my_dataset[key] with open(clean_dataset_filename, 'wb') as clean_dataset_file: pickle.dump(clean_dataset, clean_dataset_file) # dataset_filename = os.path.join(os.path.dirname(__file__), # 'DatasetsBeforeProcessing', # 'dataset_without_repetition_return_ncells.txt') # clean_dataset_filename = os.path.join(os.path.dirname(__file__), # 'datasets', # 'clean_dataset.txt') # cleaning_dataset(dataset_filename, clean_dataset_filename) def convert_to_timing(timing_str, penalization=2): if not contains_float(timing_str): print(penalization * float(timing_str[5:])) return penalization * float(timing_str[5:]) return float(timing_str) def convert_to_cells(cells, penalization=2): int_cells = [int(cell) if contains_int(cell) else cell for cell in cells] max_cells = max([cell for cell in int_cells if type(cell) == int]) 
penalization_cells = [cell if type(cell) == int else penalization*max_cells for cell in int_cells] return penalization_cells def contains_float(input_str): float_pattern = r'^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$' match = re.search(float_pattern, input_str) return match is not None def contains_int(input_str): int_pattern = r'^[-+]?\d+$' match = re.match(int_pattern, input_str) return match is not None cleaning_dataset()
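As a quick sanity check of the penalisation helpers, a minimal sketch (not part of the original file) follows. It assumes raw values are either plain numeric strings or the literal "Over 30" strings handled by convert_to_timing and convert_to_cells; since the module calls cleaning_dataset() at import time, the easiest way to try this is to paste the helpers into an interactive session.

# Hypothetical usage sketch of the helpers defined above (not in the original
# file). "Over 30"[5:] == "30", so a timed-out ordering is penalised to
# penalization * 30 = 60.0 with the default penalization of 2.
print(convert_to_timing('12.5'))                  # 12.5
print(convert_to_timing('Over 30'))               # 60.0 (the helper also prints it)
# Unfinished cell counts become penalization * (largest finished count): 2 * 25 = 50.
print(convert_to_cells(['10', '25', 'Over 30']))  # [10, 25, 50]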