diff --git a/Heuristics/__init__.py b/Heuristics/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Heuristics/create_graphs_heuristics.py b/Heuristics/create_graphs_heuristics.py
new file mode 100644
index 0000000..0845cf4
--- /dev/null
+++ b/Heuristics/create_graphs_heuristics.py
@@ -0,0 +1,228 @@
+'''
+This file contains the functions to create the graphs comparing the heuristics.
+'''
+
+import matplotlib.pyplot as plt
+from matplotlib.pyplot import cm
+import pickle
+import os
+import numpy as np
+from numpy import sort
+from numpy import Inf
+
+from .heuristics_guess import choose_order_given_projections
+from .heuristic_tools import get_dataset, substract_two_timings, finding_time_limit, compute_markups, compute_real_timings
+
+import matplotlib
+matplotlib.rcParams['mathtext.rm'] = 'Bitstream Vera Sans'
+matplotlib.rcParams['mathtext.it'] = 'Bitstream Vera Sans:italic'
+matplotlib.rcParams['mathtext.bf'] = 'Bitstream Vera Sans:bold'
+matplotlib.rcParams['mathtext.fontset'] = 'cm'
+matplotlib.rcParams['font.family'] = 'STIXGeneral'
+
+fontsize = 15
+desired_font = {'fontname': 'monospace'}
+matplotlib.rcParams.update({'font.size': fontsize})
+
+folder_figures = os.path.join(os.path.dirname(__file__), '..', 'Art')
+
+######################################################
+### LEARN TO USE PNG https://riptutorial.com/matplotlib/example/10066/saving-and-exporting-plots-that-use-tex#:~:text=In%20order%20to%20include%20plots,text%20in%20the%20final%20document.&text=Plots%20in%20matplotlib%20can%20be,macro%20package%20to%20display%20graphics.
+########################################################
+
+
+def create_survival_plot(
+        heuristics=['virtual_best', 'gmods', 'mods', 'brown', 'sotd', 'greedy_sotd'],
+        minimum_time_to_consider=0,
+        rep=10
+        ):
+    '''This function creates a survival plot comparing the desired heuristics.'''
+
+    dataset = get_dataset(without_repetition=True, minimum_time_to_consider=minimum_time_to_consider)
+    projections, targets, timings, heuristics_costs, ncells = dataset
+
+    color = cm.rainbow(np.linspace(0, 1, len(heuristics)+1))
+    # color[4] = [0.8, 0.8, 0.2, 1]
+    color[3] = [0.65, 0.42, 0.42, 1]
+    color[2] = [0.00, 1, 0.5, 1]
+    # color = ['0', '0.5', '0', '0.5', '0', '0.5']
+    style = ['--', '--', '--', '--', '--', '--']
+    dashes = [(1, 0), (5, 1), (5, 1, 1, 1), (2, 1, 2, 1), (1, 1), (5, 5)]
+
+    for heuristic, c, s, d in zip(heuristics, color, style, dashes):
+        many_sorted_timings = []
+        for i in range(rep):
+            if heuristic == 'virtual_best':
+                rawtimings = [timing[target] for timing, target in zip(timings, targets)]
+            else:
+                guesses = [choose_order_given_projections(projection, method=heuristic) for projection in projections]
+                rawtimings = [timing[guess] for timing, guess in zip(timings, guesses)]
+            # the comparison below is reconstructed: part of this line was lost,
+            # and comparing against the instance's time limit is an assumption
+            sorted_timings = sort([timing for timing, all_orders_timing in zip(rawtimings, timings) if type(timing) != str and timing < finding_time_limit(all_orders_timing)])
+            many_sorted_timings.append(sorted_timings)
+        # reconstructed plotting code: the original lines were lost; the axis
+        # labels and the output file name below are assumptions
+        avg_sorted_timings = average_sorted_timings(many_sorted_timings)
+        plt.plot(avg_sorted_timings, range(len(avg_sorted_timings)), color=c, linestyle=s, dashes=d, label=str(heuristic))
+    plt.xlabel('Time')
+    plt.ylabel('Problems finished')
+    plt.legend()
+    plt.savefig(os.path.join(folder_figures, 'survival_plot.png'))
+
+
+def average_sorted_timings(many_sorted_timings):
+    '''
+    Averages, index by index, the sorted timing lists obtained over the
+    repetitions, padding shorter lists with their last element.
+    (Only the comprehension and the return survived extraction; the function
+    name, the signature and the line computing avg_len are reconstructions.)
+    '''
+    avg_len = max(len(st) for st in many_sorted_timings)
+    avg_sorted_timings = [sum([st[-1] if i >= len(st) else st[i] for st in many_sorted_timings])/len(many_sorted_timings) for i in range(avg_len)]
+    return avg_sorted_timings
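The reconstructed helper pads shorter survival curves with their last value, so every repetition contributes at every index. A standalone toy check of that behaviour (made-up numbers, not dataset timings):

curves = [[1.0, 2.0, 5.0], [1.5, 2.5]]
avg_len = max(len(c) for c in curves)
avg = [sum(c[-1] if i >= len(c) else c[i] for c in curves) / len(curves)
       for i in range(avg_len)]
print(avg)  # [1.25, 2.25, 3.75]: the short curve contributes its last value, 2.5, at index 2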
+def create_adversarial_plot(
+        heuristic1='gmods',
+        heuristic2='avegavegdeg'
+        ):
+    '''This function creates an adversarial plot comparing the desired heuristics.'''
+
+    dataset = get_dataset(without_repetition=True, minimum_time_to_consider=0)
+    # we always want all examples here
+    projections, _, timings, heuristics_costs, ncells = dataset
+
+    guesses1 = [choose_order_given_projections(projection, method=heuristic1) for projection in projections]
+    rawtimings1 = [timing[guess] for timing, guess in zip(timings, guesses1)]
+    timings1 = [timing if type(timing) != str and timing < 30 else 30 for timing in rawtimings1]
+    # ... (the remainder of this function, including the symmetric treatment of
+    # heuristic2 and the plot itself, was lost in extraction, together with the
+    # rest of this file and the opening of Heuristics/heuristic_tools.py; the
+    # capped timings1 line above is a plausible completion)
diff --git a/Heuristics/heuristic_tools.py b/Heuristics/heuristic_tools.py
new file mode 100644
--- /dev/null
+++ b/Heuristics/heuristic_tools.py
+'''
+Helper functions used by the heuristics and by the graph-creating functions.
+'''
+# (The opening of this file was lost in extraction. Besides the imports below,
+# it must also define compute_markups, compute_real_timings, greedy_heuristics,
+# expensive_heuristics and create_pseudorderings, which other modules import
+# from here.)
+
+import os
+import json
+import pickle
+import itertools
+
+
+def finding_time_limit(timings):
+    '''
+    Returns the time limit used for an instance.
+    (Only the tail of this function survived; the argument name and the
+    condition are assumptions consistent with a 30/60-second two-tier timeout.)
+    '''
+    if any(type(timing) != str and timing > 30 for timing in timings):
+        return 60
+    else:
+        return 30
+
+
+def minimum_indices(given_list):
+    '''
+    Returns the indices containing the minima of a list.
+    Helpful function for the heuristics below.
+    '''
+    minimum = min(given_list)
+    return [index for index, value in enumerate(given_list) if value == minimum]
+
+
+def multiplyList(myList):
+    '''
+    Multiplies together all the elements in a list.
+    '''
+    result = 1
+    for x in myList:
+        result = result * x
+    return result
+
+
+def all_combinations(l):
+    '''
+    Returns all possible combinations of a given list.
+    More concretely, all possible subsets ordered in all possible ways.
+    '''
+    combs_with_order = []
+    for i in range(1, len(l)+1):
+        combs = list(itertools.combinations(l, i))
+        for comb in combs:
+            combs_with_order += list(itertools.permutations(comb, i))
+    return combs_with_order
+
+
+def all_combinations_fixed_length(l, i):
+    '''
+    Returns all combinations of a given list that have length 'i'.
+    More concretely, all subsets of size 'i' ordered in all possible ways.
+    '''
+    combs_with_order = []
+    combs = list(itertools.combinations(l, i))
+    for comb in combs:
+        combs_with_order += list(itertools.permutations(comb, i))
+    return combs_with_order
+
+
+def trim_dataset(dataset, minimum_time_to_consider=0):
+    '''
+    Returns the dataset containing only the problems that took
+    at least 'minimum_time_to_consider' seconds to finish.
+    '''
+    projections, targets, timings, heuristics_costs, ncells = dataset
+    new_projections = [projection for projection, timing, target in zip(projections, timings, targets) if timing[target] > minimum_time_to_consider]
+    new_targets = [target for target, timing in zip(targets, timings) if timing[target] > minimum_time_to_consider]
+    new_timings = [timing for timing, target in zip(timings, targets) if timing[target] > minimum_time_to_consider]
+    new_heuristics_costs = [heuristics_cost for heuristics_cost, timing, target in zip(heuristics_costs, timings, targets) if timing[target] > minimum_time_to_consider]
+    new_ncells = [ncells for ncells, timing, target in zip(ncells, timings, targets) if timing[target] > minimum_time_to_consider]
+
+    return new_projections, new_targets, new_timings, new_heuristics_costs, new_ncells
+
+
+def get_dataset(without_repetition=True, return_ncells=True, minimum_time_to_consider=0):
+    '''
+    Loads the desired dataset from its location.
+    '''
+    if without_repetition:
+        aux_name = 'without_repetition'
+    else:
+        aux_name = 'with_repetition'
+
+    if return_ncells:
+        dataset_location = os.path.join(os.path.dirname(__file__), '..', 'Datasets', 'ThreeVariableSMTLIB2021', 'dataset_'+aux_name+'_return_ncells.txt')
+    else:
+        dataset_location = os.path.join(os.path.dirname(__file__), '..', 'Datasets', 'dataset_'+aux_name+'.txt')
+
+    with open(dataset_location, 'rb') as f:
+        dataset = pickle.load(f)
+
+    return trim_dataset(dataset, minimum_time_to_consider=minimum_time_to_consider)
+
+
+def aveg_of_not_zero(given_list):
+    '''
+    Takes the average of a list without considering the elements that are 0.
+    '''
+    s = sum(given_list)
+    if s > 0:
+        return s/sum([1 for elem in given_list if elem > 0])
+    else:
+        return 0
+
+
+def substract_two_timings(time1, time2):
+    '''time1 minus time2; a string timing (a timeout) counts as the 30-second limit'''
+    if type(time1) is str and type(time2) is str:
+        return 0
+    elif type(time1) is str and type(time2) is not str:
+        return 30
+    elif type(time1) is not str and type(time2) is str:
+        return -30
+    elif type(time1) is not str and type(time2) is not str:
+        return time1 - time2
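For a concrete sense of what all_combinations produces (and how fast it grows), here is a standalone check; the helper is copied from above so the snippet runs on its own, and the toy lists are illustrative only:

import itertools

def all_combinations(l):
    combs_with_order = []
    for i in range(1, len(l)+1):
        for comb in itertools.combinations(l, i):
            combs_with_order += list(itertools.permutations(comb, i))
    return combs_with_order

print(all_combinations(['a', 'b']))
# [('a',), ('b',), ('a', 'b'), ('b', 'a')]
print(len(all_combinations(['a', 'b', 'c'])))
# 15 = 3 singletons + 6 ordered pairs + 6 orderings of the full set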
+
+
+# This is how to save the best features
+file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'best_features')
+# best_features = ['summaxdeg', 'avegavegdeg', 'sumsumdeg', 'avegavegsigndeg', 'sumsignsumdeg', 'summaxsvdeg']  # , 'sumsumsigndeg', 'sumsumsvdeg'
+# with open(file_path, 'w') as file:
+#     json.dump(best_features, file)
+
+# This is how to load the best features
+with open(file_path, 'r') as file:
+    best_features = json.load(file)
+
+paper_all_pos = all_combinations(best_features)
+indices = list(range(len(best_features)))
+paper_all_indices = [str(elem).replace(', ', '>').replace('(', '').replace(')', '') for elem in all_combinations(indices)]
+existing_heuristics = ['brown', 'mods', 36, 'random', 'virtual_best']  # 36 is gmods
+survival_plot_heuristics = ['virtual_best', 36, 'brown']
+ml_models = []
diff --git a/Heuristics/heuristics_guess.py b/Heuristics/heuristics_guess.py
new file mode 100644
index 0000000..01bdc29
--- /dev/null
+++ b/Heuristics/heuristics_guess.py
@@ -0,0 +1,97 @@
+'''
+This file contains the functions that, given all projections with
+all possible orderings, return the ordering that would have been
+chosen by the desired heuristic.
+'''
+
+import random
+from math import factorial
+from .heuristics_rules import *
+from .heuristic_tools import greedy_heuristics, expensive_heuristics, create_pseudorderings, ml_models
+
+
+def choose_order_given_projections(projections, method="gmods"):
+    '''Returns the order guessed by the requested heuristic.'''
+    if method in greedy_heuristics or type(method) == int:
+        guess = greedy_heuristic_guess(projections, heuristic=method)
+        return guess
+    elif method in expensive_heuristics:
+        return no_greedy_heuristic_guess(projections, heuristic=method)
+    elif method in ml_models:
+        return ml_model_guess(projections, method=method)
+    else:
+        raise Exception(f'Heuristic not recognised: {method}.')
+
+
+def greedy_heuristic_guess(projections: list, heuristic: str = "gmods"):
+    '''
+    This function is specialized in greedy heuristics.
+    One variable is picked at a time, adjusting the ordering accordingly.
+    '''
+    order = 0  # we start assuming that the best order is the first one
+    nvar = len(projections[0])  # the number of variables corresponds to the length of the list describing one of the projections
+
+    for i in range(nvar):
+        # projections[order] is the projection of the order we currently assume
+        # to be the best; all orders we can still choose from agree with it up
+        # to this point
+        try:
+            if heuristic != 'greedy_sotd':
+                new_var = greedy_choose_variable(projections[order][i], heuristic=heuristic)
+            elif i < nvar-1:
+                new_var = greedy_choose_variable([projections[ordering][i+1] for ordering in range(factorial(nvar)) if projections[ordering][i] == projections[order][i]], heuristic=heuristic)
+            else:
+                new_var = 0
+        except IndexError:
+            # this error probably means that the computation of the projection
+            # did not get any further; in this case we return the current order
+            return order
+
+        if type(new_var) == str:
+            return order
+        order = order + factorial(nvar-i-1) * new_var  # the best order is updated with the new information
+    return order  # the final best order is returned
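The update order = order + factorial(nvar-i-1) * new_var builds a Lehmer-style code: the final integer is the index of the chosen ordering in the lexicographic list of permutations, which is exactly the assignment dictionary used for nvar = 3 below. A standalone sketch (not repository code, and assuming new_var indexes the variables still to be projected) that decodes the integer back:

from math import factorial
from itertools import permutations

def decode_order(order, nvar):
    # at each step, order // (nvar-i-1)! picks the next variable among
    # those still unused, then the remainder encodes the rest
    remaining = list(range(nvar))
    ordering = []
    for i in range(nvar):
        block = factorial(nvar - i - 1)
        ordering.append(remaining.pop(order // block))
        order %= block
    return ordering

# the integer indexes the lexicographic list of permutations
assert [decode_order(k, 3) for k in range(6)] == [list(p) for p in permutations(range(3))]
print(decode_order(4, 3))  # [2, 0, 1], matching '[2, 0, 1]': 4 below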
+def not_greedy_heuristic_guess(original_polynomials: list,
+                               heuristic: str = "gmods"):
+    '''
+    This function is specialized in non-greedy heuristics.
+    All variables are picked from the original polynomials.
+    '''
+    order = 0  # we start assuming that the best order is the first one
+    order_measure = get_order_measure(heuristic, if_tie=None)
+    degrees_list, nvar = get_degree_list(original_polynomials, heuristic)
+    variables = list(range(nvar))
+    ordering = []
+
+    while len(variables) != 0:
+        best_vars = variables
+        for measure in order_measure:
+            if measure is None:  # if_tie=None: remaining ties are broken by the shuffle below
+                continue
+            best_vars = choose_variables_minimizing(degrees_list, measure=measure, var_list=best_vars)
+        random.shuffle(best_vars)
+        # print('best vars shuffled', best_vars)
+        ordering += best_vars
+        variables = [var for var in variables if var not in ordering]
+    assignment = {'[0, 1, 2]': 0, '[0, 2, 1]': 1,
+                  '[1, 0, 2]': 2, '[1, 2, 0]': 3,
+                  '[2, 0, 1]': 4, '[2, 1, 0]': 5,
+                  }
+    order = assignment[str(ordering)]
+    # order = order + factorial(nvar-i-1) * new_var  # the best order is updated with the new information
+    return order  # the final best order is returned
+
+
+def no_greedy_heuristic_guess(projections: list, heuristic: str = "old_mods"):
+    '''
+    Looking at all the projections at the same time,
+    the non-greedy heuristics make an ordering choice.
+    '''
+    if heuristic == "sotd":
+        sotd_values = [sum([degree for level in projection for polynomial in level for monomial in polynomial for degree in monomial[:-1]]) for projection in projections]
+        return min(range(len(sotd_values)), key=sotd_values.__getitem__)  # returns the index with the smallest value in the list sotd_values
+    elif heuristic in ["old_mods", "logmods", "mods", "acc_logmods"]:
+        nvar = len(projections[0])
+        pseudorderings = create_pseudorderings(nvar)
+        relevant_degrees = [[[max([monomial[var] for monomial in polynomial]) for polynomial in level] for level, var in zip(projection, pseudordering)] for projection, pseudordering in zip(projections, pseudorderings)]
+        # this is a list with one entry per projection; each entry lists, per level,
+        # the degrees of its polynomials with respect to the variable projected next
+        heuristic_dict = {'old_mods': old_mods_guess, 'mods': mods_guess, 'logmods': logmods_guess, 'acc_logmods': acc_logmods_guess}
+        return heuristic_dict[heuristic](relevant_degrees)
+    else:
+        raise Exception("Heuristic "+heuristic+" not found.")
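Both this file and the rules file that follows consume the same encoding: a projection is a list of levels, a level is a list of polynomials, a polynomial is a list of monomials, and a monomial is a list of exponents with the coefficient in the last position (hence the monomial[:-1] above). A self-contained toy illustration of the encoding and of the sotd sum; the polynomials here are made up:

# 3*x^2*y - z in variables (x, y, z) encodes as [[2, 1, 0, 3], [0, 0, 1, -1]]
projection = [
    [[[2, 1, 0, 3], [0, 0, 1, -1]]],  # level 1: the polynomial 3*x^2*y - z
    [[[1, 1, 1, 5]]],                 # level 2: the polynomial 5*x*y*z
]
sotd = sum(degree
           for level in projection
           for polynomial in level
           for monomial in polynomial
           for degree in monomial[:-1])
print(sotd)  # (2+1+0) + (0+0+1) + (1+1+1) = 7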
diff --git a/Heuristics/heuristics_rules.py b/Heuristics/heuristics_rules.py
new file mode 100644
index 0000000..60ff9bf
--- /dev/null
+++ b/Heuristics/heuristics_rules.py
@@ -0,0 +1,193 @@
+'''
+This file contains all the details necessary for the
+different heuristics to make their choices.
+'''
+
+import os
+import numpy as np
+from math import log
+import itertools
+import random
+from .heuristic_tools import multiplyList, all_combinations, minimum_indices, aveg_of_not_zero, paper_all_pos
+
+
+def choose_variables_minimizing(degrees_list, measure='gmods', var_list=''):
+    '''Given a list of the degrees of polynomials, returns the list of variables that minimise the desired measure.'''
+    if measure != 'greedy_sotd':
+        nvar = len(degrees_list[0][0])  # the number of variables is the same everywhere; we check the first monomial of the first polynomial
+    else:
+        nvar = len(degrees_list[0][0][0])
+    if var_list == '':  # if the value is the default one
+        var_list = range(nvar)
+
+    if measure == 'gmods':
+        sum_degree_polys = [sum([max([monomial[var] for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list]  # for each variable, the degree of every polynomial in that variable; these values are then added over all polynomials, which is what we really care about
+        return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degree_polys)]  # var_list is filtered
+    if measure == 'ali_aveg':
+        av_degree_polys_with_var = [aveg_of_not_zero([max([monomial[var] for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list]  # for each variable, the average degree over the polynomials that actually contain it
+        return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(av_degree_polys_with_var)]  # var_list is filtered
+    elif measure == 'greedy_logmods':
+        sum_degrees_overall_polys = [sum([log(max([1]+[monomial[var] for monomial in polynomial])) for polynomial in degrees_list]) for var in var_list]  # for each variable, the logarithms of the degrees added over all polynomials
+        return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)]
+    elif measure == 'brown1':
+        max_degrees_polywise = [max([max([monomial[var] for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list]  # for each variable, the maximum degree over all the polynomials
+        return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(max_degrees_polywise)]
+    elif measure == 'brown2':
+        max_degrees_polywise = [max([max([0]+[monomial[var] for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list]  # for each variable, the maximum degree over all the polynomials
+        degrees_of_monomials_with_max_degrees = [max([max([0]+[sum(monomial) for monomial in polynomial if monomial[var] == max_degree]) for polynomial in degrees_list]) for var, max_degree in zip(var_list, max_degrees_polywise)]  # for each variable, the maximum total degree among the monomials attaining that maximum degree (the zip pairs each variable with its own maximum; the original indexed max_degrees_polywise by the variable number, which is only correct when var_list is the full range)
+        return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(degrees_of_monomials_with_max_degrees)]
+    elif measure == 'brown3':
+        number_appearances = [sum([sum([np.sign(monomial[var]) for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list]  # the number of monomials in which the variable appears is counted
+        return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(number_appearances)]
+    # (the commented-out measures below all follow the same pattern: compute a
+    # score per variable, then keep the variables minimising it)
+    # elif measure == 'avegmaxsvdeg':
+    #     sum_degrees_overall_polys = [np.average([max([sum(monomial) for monomial in polynomial if monomial[var] > 0]) for polynomial in degrees_list]) for var in var_list]
+    #     return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)]  # var_list is filtered
+    # elif measure == 'maxsumsvdeg':
+    #     sum_degrees_overall_polys = [max([sum([sum(monomial) for monomial in polynomial if monomial[var] > 0]) for polynomial in degrees_list]) for var in var_list]
+    #     return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)]  # var_list is filtered
+    # elif measure == 'avegsumsvdeg':
+    #     sum_degrees_overall_polys = [np.average([sum([sum(monomial) for monomial in polynomial if monomial[var] > 0]) for polynomial in degrees_list]) for var in var_list]
+    #     return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)]  # var_list is filtered
+    # elif measure == 'avegsumdeg':
+    #     sum_degrees_overall_polys = [np.average([sum([monomial[var] for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list]
+    #     return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)]  # var_list is filtered
+    # elif measure == 'avegavegdeg':
+    #     aveg_degrees_overall_polys = [np.average([np.average([monomial[var] for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list]
+    #     return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(aveg_degrees_overall_polys)]  # var_list is filtered
+
+    # elif measure == 'maxsumdeg':
+    #     sum_degrees_overall_polys = [max([sum([monomial[var] for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list]
+    #     return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)]  # var_list is filtered
+    elif measure == 'sumsignsumdeg':
+        sum_degrees_overall_polys = [np.sum(np.sign([np.sum([monomial[var] for monomial in polynomial]) for polynomial in degrees_list])) for var in var_list]  # for each variable, the number of polynomials in which it appears (the sign of the summed degrees is 1 exactly when the variable occurs)
+        return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)]  # var_list is filtered
+    # elif measure == 'sumsumdeg':
+    #     sum_degrees_overall_polys = [sum([sum([monomial[var] for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list]
+    #     return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)]  # var_list is filtered
+    # elif measure == 'avegvegsigndeg':
+    #     sum_degrees_overall_polys = [np.average([np.average([np.sign(monomial[var]) for monomial in polynomial]) for polynomial in degrees_list]) for var in var_list]
+    #     return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)]  # var_list is filtered
+    # elif measure == 'sumsumsvdeg':
+    #     sum_degrees_overall_polys = [sum([sum([sum(monomial) for monomial in polynomial if monomial[var] > 0]) for polynomial in degrees_list]) for var in var_list]
+    #     return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_degrees_overall_polys)]  # var_list is filtered
+    elif measure == 'greedy_sotd':
+        sum_total_degrees = [sum([sum(monomial) for polynomial in possible_proj_set for monomial in polynomial]) for possible_proj_set in degrees_list]
+        return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(sum_total_degrees)]  # var_list is filtered
+    elif measure == 'random':
+        return [random.choice(var_list)]
+    elif measure == 'first':
+        return [var_list[0]]
+    elif measure == 'last':
+        return [var_list[-1]]
+    elif type(measure) == str:
+        if measure[-5:] == 'svdeg':
+            measure = measure[:-5]
+            monomial_numbers = [[[sum(monomial) for monomial in polynomial if monomial[var] > 0] for polynomial in degrees_list] for var in var_list]
+        elif measure[-7:] == 'signdeg':
+            measure = measure[:-7]
+            monomial_numbers = [[[np.sign(monomial[var]) for monomial in polynomial] for polynomial in degrees_list] for var in var_list]
+        elif measure[-3:] == 'deg':
+            measure = measure[:-3]
+            monomial_numbers = [[[monomial[var] for monomial in polynomial] for polynomial in degrees_list] for var in var_list]
+        else:
+            raise Exception(measure+" is not a valid measure")
+
+        if measure[-3:] == 'sum':
+            measure = measure[:-3]
+            polynomial_numbers = [[sum(monomial_numbers_in_poly) for monomial_numbers_in_poly in var_monomial_numbers] for var_monomial_numbers in monomial_numbers]
+        elif measure[-3:] == 'max':
+            measure = measure[:-3]
+            polynomial_numbers = [[max(monomial_numbers_in_poly) if len(monomial_numbers_in_poly) > 0 else 0 for monomial_numbers_in_poly in var_monomial_numbers] for var_monomial_numbers in monomial_numbers]
+        elif measure[-4:] == 'aveg':
+            measure = measure[:-4]
+            polynomial_numbers = [[np.average(monomial_numbers_in_poly) for monomial_numbers_in_poly in var_monomial_numbers] for var_monomial_numbers in monomial_numbers]
+        else:
+            raise Exception("Not a valid measure - maybe add the possibility of sign here")
+
+        if measure == 'sum':
+            set_numbers = [sum(var_polynomial_numbers) for var_polynomial_numbers in polynomial_numbers]
+        elif measure == 'max':
+            set_numbers = [max(var_polynomial_numbers) if len(var_polynomial_numbers) > 0 else 0 for var_polynomial_numbers in polynomial_numbers]
+        elif measure == 'aveg':
+            set_numbers = [np.average(var_polynomial_numbers) for var_polynomial_numbers in polynomial_numbers]
+        else:
+            raise Exception("Not a valid measure")
+        return [var_list[i] for i in range(len(var_list)) if i in minimum_indices(set_numbers)]  # var_list is filtered
+
+
+def get_order_measure(heuristic, if_tie='random'):
+    if heuristic == 'brown':
+        order_measure = ['brown1', 'brown2', 'brown3', if_tie]
+    elif type(heuristic) == int:
+        order_measure = list(paper_all_pos[heuristic])+[if_tie]
+    else:
+        order_measure = [heuristic, if_tie]
+    return order_measure
+
+
+def get_degree_list(poly_list, heuristic):
+    if heuristic != 'greedy_sotd':
+        degrees_list = [[monomial[:-1] for monomial in polynomial] for polynomial in poly_list]  # the same list without the coefficients
+        nvar = len(degrees_list[0][0])  # the number of variables is the same everywhere; we check the first monomial of the first polynomial
+    else:
+        degrees_list = [[[monomial[:-1] for monomial in polynomial] for polynomial in polys] for polys in poly_list]  # the same list without the coefficients
+        nvar = len(degrees_list[0][0][0])
+    # if degrees_list == []:  # unclear why this happens, but we just return this message
+    #     return "The list given is empty"
+    return degrees_list, nvar
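The three suffix-stripping stages above mean a compound measure name reads right to left: for instance 'summaxdeg' takes each variable's degree in every monomial ('deg'), reduces each polynomial's monomials with max ('max'), and reduces the list of polynomials with sum ('sum'). A standalone check on toy exponent vectors (not repository data):

degrees_list = [
    [[2, 1, 0], [0, 0, 1]],  # polynomial 1: exponent vectors of its monomials
    [[1, 1, 1]],             # polynomial 2
]
var = 0
per_monomial = [[monomial[var] for monomial in polynomial]
                for polynomial in degrees_list]          # 'deg'   -> [[2, 0], [1]]
per_polynomial = [max(ms) if ms else 0 for ms in per_monomial]  # 'max' -> [2, 1]
summaxdeg = sum(per_polynomial)                          # 'sum'   -> 3
print(summaxdeg)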
+def greedy_choose_variable(poly_list, heuristic='gmods'):
+    '''Given a list of polynomials, returns the variable that the given heuristic would choose to project next.'''
+    order_measure = get_order_measure(heuristic, if_tie='random')
+    degrees_list, nvar = get_degree_list(poly_list, heuristic)
+    best_vars = range(nvar)
+    n_random_choice = 1
+    while len(best_vars) > 1:
+        measure = order_measure.pop(0)
+        if measure == 'random':
+            # if we reach 'random' we record how many variables are left
+            n_random_choice = len(best_vars)
+        best_vars = choose_variables_minimizing(degrees_list, measure=measure, var_list=best_vars)
+    # The following lines are only used to answer a question from the reviewers
+    if nvar == 3 and (heuristic == 'gmods' or heuristic == 36 or heuristic == 'brown'):
+        file_random_name = os.path.join(os.path.dirname(__file__), '..', 'Datasets', f"{heuristic}_random_choices.txt")
+        with open(file_random_name, 'a') as f:
+            f.write(f"{n_random_choice}, ")
+    return best_vars[0]
+
+
+##
+# Rules for expensive heuristics
+##
+
+def old_mods_guess(mrd):  # mrd -> mods relevant degrees
+    '''Computes the best order according to the old_mods heuristic (multiplication of relative degrees).'''
+    old_mods_values = [multiplyList([sum([degree for degree in level_mrd if degree != 0]) for level_mrd in proj_mrd]) for proj_mrd in mrd]
+    return min(range(len(old_mods_values)), key=old_mods_values.__getitem__)  # returns the index with the smallest value in the list old_mods_values
+
+
+def logmods_guess(mrd):
+    '''Computes the best order according to the logmods heuristic (multiplication of the summed logarithms of relative degrees).'''
+    logmods_values = [multiplyList([sum([log(degree) for degree in level_mrd if degree != 0]) for level_mrd in proj_mrd]) for proj_mrd in mrd]
+    return min(range(len(logmods_values)), key=logmods_values.__getitem__)  # returns the index with the smallest value in the list logmods_values
+
+
+def mods_guess(mrd):
+    '''Computes the best ordering by minimizing the maximum number of cells in the final CAD.'''
+    mods_values = [multiplyList([1+2*sum([degree for degree in level_mrd if degree != 0]) for level_mrd in proj_mrd]) for proj_mrd in mrd]
+    return min(range(len(mods_values)), key=mods_values.__getitem__)  # returns the index with the smallest value in the list mods_values
+
+
+def super_mods_guess(mrd):
+    '''Computes the best ordering by minimizing the maximum number of cells in all the CADs needed to build the final CAD.'''
+    mods_values = [sum([multiplyList([1+2*sum([degree for degree in level_mrd if degree != 0]) for level_mrd in proj_mrd[:i+1]]) for i in range(len(proj_mrd))]) for proj_mrd in mrd]
+    return min(range(len(mods_values)), key=mods_values.__getitem__)  # returns the index with the smallest value in the list mods_values
+
+
+def acc_logmods_guess(mrd):
+    '''Computes the best order according to the acc_logmods heuristic (like mods, but with the logarithms of the relative degrees).'''
+    acc_logmods_values = [multiplyList([1+2*sum([log(degree) for degree in level_mrd if degree != 0]) for level_mrd in proj_mrd]) for proj_mrd in mrd]
+    return min(range(len(acc_logmods_values)), key=acc_logmods_values.__getitem__)  # returns the index with the smallest value in the list acc_logmods_values
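mods_guess implements the cell-count bound its docstring names: at each level, polynomials whose degrees sum to D have at most D real roots, which split a line into at most 1 + 2D cells, and multiplying over levels bounds the size of the final CAD. A standalone toy check (multiplyList copied from heuristic_tools so the snippet runs on its own; the degree lists are made up):

def multiplyList(myList):
    result = 1
    for x in myList:
        result = result * x
    return result

mrd = [
    [[2, 1], [3]],  # ordering A: per-level degrees -> (1+2*3)*(1+2*3) = 49
    [[1, 1], [4]],  # ordering B: per-level degrees -> (1+2*2)*(1+2*4) = 45
]
mods_values = [multiplyList([1 + 2*sum(d for d in level if d != 0) for level in proj])
               for proj in mrd]
print(mods_values)                                                # [49, 45]
print(min(range(len(mods_values)), key=mods_values.__getitem__))  # 1, i.e. ordering B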
diff --git a/choose_hyperparams.py b/choose_hyperparams.py
index 53e51c1..8a9a54c 100644
--- a/choose_hyperparams.py
+++ b/choose_hyperparams.py
@@ -3,7 +3,7 @@
 import csv
 from config.ml_models import ml_models
 from config.ml_models import sklearn_models
-from config.ml_models import dataset_types
+from config.general_values import dataset_qualities
 from config.hyperparameters_grid import grid
 from sklearn.model_selection import GridSearchCV
 from yaml_tools import write_yaml_to_file
@@ -56,17 +56,17 @@ def choose_hyperparams(ml_model, method):
 #                                        'ml_results_k_fold_tested_in_balanced.csv')
 # with open(output_file_balanced, 'w') as f_balanced:
 #     writer_balanced = csv.writer(f_balanced)
-#     writer_balanced.writerow(["Name"] + dataset_types)
+#     writer_balanced.writerow(["Name"] + dataset_qualities)
 # output_file_normal = os.path.join(os.path.dirname(__file__),
 #                                   'ml_results_k_fold_tested_in_normal.csv')
 # with open(output_file_normal, 'w') as f_normal:
 #     writer_normal = csv.writer(f_normal)
-#     writer_normal.writerow(["Name"] + dataset_types)
+#     writer_normal.writerow(["Name"] + dataset_qualities)
 # for ml_model in ml_models:
 #     print(f"Model: {ml_model}")
 #     acc_balanced = dict()
 #     acc_normal = dict()
-#     for method in dataset_types:
+#     for method in dataset_qualities:
 #         this_dataset_file = os.path.join(os.path.dirname(__file__),
 #                                          'datasets', 'train',
 #                                          f'{method}_train_dataset.txt')
@@ -91,10 +91,10 @@ def choose_hyperparams(ml_model, method):
 #         pickle.dump(clf, method_file)
 #     round_accuracies_balanced = [round(acc, 2)
 #                                  for acc in [acc_balanced[method_here]
-#                                              for method_here in dataset_types]]
+#                                              for method_here in dataset_qualities]]
 #     round_accuracies_normal = [round(acc, 2)
 #                                for acc in [acc_normal[method_here]
-#                                            for method_here in dataset_types]]
+#                                            for method_here in dataset_qualities]]
 #     writer_balanced.writerow([ml_model] + round_accuracies_balanced)
 #     writer_normal.writerow([ml_model] + round_accuracies_normal)
diff --git a/create_clean_dataset.py b/create_clean_dataset.py
index 4d60cc6..97c3114 100644
--- a/create_clean_dataset.py
+++ b/create_clean_dataset.py
@@ -17,7 +17,7 @@ def create_dataframe(dataset):
     all_features = []
-    all_targets = dataset[1][:]
+    all_labels = dataset[1][:]
     all_timings = dataset[2][:]
     all_original_polynomials = []
     for index, all_projections in enumerate(dataset[0]):
@@ -28,7 +28,7 @@ def create_dataframe(dataset):
                                 determine_standarization=True,
                                 determine_unique_features=True)
     return np.array(all_original_polynomials), np.array(names),\
-        np.array(all_features), np.array(all_targets), np.array(all_timings)
+        np.array(all_features), np.array(all_labels), np.array(all_timings)
 # dataset_filename = os.path.join(os.path.dirname(__file__),
@@ -36,7 +36,7 @@ def create_dataframe(dataset):
 #                                 'datasets',
 #                                 'dataset_without_repetition_return_ncells.txt')
 # with open(dataset_filename, 'rb') as f:
 #     dataset = pickle.load(f)
-# original_polys_list, names, features_list, targets_list, timings_list =\
+# original_polys_list, names, features_list, labels_list, timings_list =\
 #     create_dataframe(dataset)
diff --git a/datasets/clean_dataset.txt b/datasets/clean_dataset.txt
index 24c1b18..b7d2456 100644
Binary files a/datasets/clean_dataset.txt and b/datasets/clean_dataset.txt differ
diff --git a/datasets/test/augmented_test_dataset.txt b/datasets/test/augmented_test_dataset.txt
index e1ab3ef..36c4323 100644
Binary files a/datasets/test/augmented_test_dataset.txt and b/datasets/test/augmented_test_dataset.txt differ
diff --git a/datasets/test/balanced_test_dataset.txt b/datasets/test/balanced_test_dataset.txt
index 1712401..1a6fd17 100644
Binary files a/datasets/test/balanced_test_dataset.txt and b/datasets/test/balanced_test_dataset.txt differ
diff --git a/datasets/test/normal_test_dataset.txt b/datasets/test/normal_test_dataset.txt
index 72b027d..5b49302 100644
Binary files a/datasets/test/normal_test_dataset.txt and b/datasets/test/normal_test_dataset.txt differ
diff --git a/datasets/train/augmented_train_dataset.txt b/datasets/train/augmented_train_dataset.txt
index 3ed1c22..a52fea7 100644
Binary files a/datasets/train/augmented_train_dataset.txt and b/datasets/train/augmented_train_dataset.txt differ
diff --git a/datasets/train/balanced_train_dataset.txt b/datasets/train/balanced_train_dataset.txt
index 0b42a35..28fd1ac 100644
Binary files a/datasets/train/balanced_train_dataset.txt and b/datasets/train/balanced_train_dataset.txt differ
diff --git a/datasets/train/normal_train_dataset.txt b/datasets/train/normal_train_dataset.txt
index 21b4a44..347a778 100644
Binary files a/datasets/train/normal_train_dataset.txt and b/datasets/train/normal_train_dataset.txt differ
diff --git a/main.py b/main.py
index 04571f1..ab1064e 100644
--- a/main.py
+++ b/main.py
@@ -1,7 +1,7 @@
 """
 The experiments in [1] are replicated with some changes.
-The first change is that the testing data is balanced, so that all targets
+The first change is that the testing data is balanced, so that all labels
 are almost equally common.
 Then we use three training sets; dataset as in [1],
 balanced dataset and data augmentation dataset.
@@ -14,7 +14,8 @@
 """
 import csv
 from config.ml_models import ml_models
-from config.ml_models import dataset_types
+from config.general_values import dataset_qualities
+from config.general_values import purposes
 from find_filename import find_dataset_filename
 from find_filename import find_model_filename
 from create_clean_dataset import cleaning_dataset
@@ -32,19 +33,19 @@
 # tune_hyperparameters = False
 paradigm = 'classification'
-# cleaning_dataset()
+cleaning_dataset()
 create_train_test_datasets()
 # if tune_hyperparameters:
 #     for ml_model in ml_models:
-#         for method in dataset_types:
+#         for method in dataset_qualities:
 #             print(f"Choosing hyperparameters for {ml_model} in {method}")
 #             choose_hyperparams(ml_model, method)
-# for ml_model in ml_models:
-#     print(f"Training {ml_model}")
-#     for method in dataset_types:
-#         print(f"for {method}")
-#         train_model(ml_model, method)
+for ml_model in ml_models:
+    print(f"Training {ml_model}")
+    for method in dataset_qualities:
+        print(f"for {method}")
+        train_model(ml_model, method)
 training_method = 'augmented'
 testing_method = 'augmented'
 first_time = 1
@@ -70,7 +72,7 @@
 # with open("classification_output_timings.csv", 'w') as f:
 #     f.write("model, Normal, Balanced, Augmented\n")
 # for ml_model in ml_models:
-#     for training_method in dataset_types:
+#     for training_method in dataset_qualities:
 #         trained_model_filename = find_model_filename(training_method,
 #                                                      ml_model)
 #         accuracy = test_model(trained_model_filename,
diff --git a/packages/dataset_manipulation/dataset_manipulation.py b/packages/dataset_manipulation/dataset_manipulation.py
index b92263e..4e18fa8 100644
--- a/packages/dataset_manipulation/dataset_manipulation.py
+++ b/packages/dataset_manipulation/dataset_manipulation.py
@@ -9,7 +9,7 @@
 nvar = 3
-def augmentate_dataset(features, targets, timings):
+def augmentate_dataset(features, targets, timings, cells):
     """
     Multiply the size of the dataset by 6.
@@ -20,16 +20,19 @@ def augmentate_dataset(features, targets, timings):
     symmetric_features = []
     symmetric_targets = []
     symmetric_timings = []
-    for features, target, timing in zip(features, targets, timings):
+    symmetric_cells = []
+    for features, target, timing, cell in \
+            zip(features, targets, timings, cells):
         symmetric_features += give_all_symmetries(features, int(target))
         symmetric_targets += list(range(math.factorial(nvar)))
         symmetric_timings += augmentate_timings(timing, int(target))
+        symmetric_cells += augmentate_timings(cell, int(target))
     return np.array(symmetric_features), np.array(symmetric_targets), \
-        np.array(symmetric_timings)
+        np.array(symmetric_timings), np.array(symmetric_cells)
-def balance_dataset(features, targets, timings):
+def balance_dataset(features, targets, timings, cells):
     """
     Balance the dataset so all targets are almost equally common.
@@ -40,15 +43,19 @@ def balance_dataset(features, targets, timings):
     balanced_features = []
     balanced_targets = []
     balanced_timings = []
-    for features, target, timing in zip(features, targets, timings):
+    balanced_cells = []
+    for features, target, timing, cell in \
+            zip(features, targets, timings, cells):
         symmetric_features = give_all_symmetries(features, int(target))
         symmetric_timings = augmentate_timings(timing, int(target))
+        symmetric_cells = augmentate_timings(cell, int(target))
         new_target = random.choice(list(range(math.factorial(nvar))))
         balanced_features.append(symmetric_features[new_target])
         balanced_targets.append(new_target)
         balanced_timings.append(symmetric_timings[new_target])
+        balanced_cells.append(symmetric_cells[new_target])
     return np.array(balanced_features), np.array(balanced_targets),\
-        np.array(balanced_timings)
+        np.array(balanced_timings), np.array(balanced_cells)
 def name_unique_features(names, features):
@@ -94,7 +101,8 @@ def remove_notunique_features(names, features, nvar=3):
     # creating some targets and timings because the function requires them
     targets = [0]*len(features)
     timings = [list(range(math.factorial(nvar)))]*len(features)
-    augmented_features, _, _ = augmentate_dataset(features, targets, timings)
+    cells = [list(range(math.factorial(nvar)))]*len(features)
+    augmented_features, _, _, _ = augmentate_dataset(features, targets, timings, cells)
     # normalized_augmented_features = normalize(augmented_features)
     unique_names = name_unique_features(names, augmented_features)
     unique_features = []
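As the docstring above says, with nvar = 3 there are 3! = 6 variable orderings, so each instance can be expanded into six relabelled copies. A standalone sketch of that blow-up with toy per-variable features; in the repository, give_all_symmetries and augmentate_timings additionally permute the targets and the timing lists consistently, which this sketch does not attempt:

import math
from itertools import permutations

nvar = 3
features = [0.1, 0.2, 0.3]  # one toy feature per variable
symmetric_features = [[features[v] for v in sigma]
                      for sigma in permutations(range(nvar))]
symmetric_targets = list(range(math.factorial(nvar)))
print(len(symmetric_features))  # 6 labelled copies from a single instance
print(symmetric_features[4])    # [0.3, 0.1, 0.2] for the ordering (2, 0, 1)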
diff --git a/replicating_Dorians_features.py b/replicating_Dorians_features.py
index 3260c82..a65bccf 100644
--- a/replicating_Dorians_features.py
+++ b/replicating_Dorians_features.py
@@ -64,7 +64,7 @@ def create_features(degrees, variable=0, sv=False,
 def extract_features(dataset):
     my_dataset = dict()
     all_features = []
-    all_targets = []
+    all_labels = []
     all_timings = []
     all_original_polynomials = []
     all_projections = []
@@ -75,7 +75,7 @@ def extract_features(dataset):
         # the original polynomials are the initial polynomials of any
         # of the possible projections (also of the first one)
         all_original_polynomials.append(original_polynomials)
-        all_targets.append(dataset[1][index])
+        all_labels.append(dataset[1][index])
         all_timings.append(dataset[2][index])
         all_cells.append(dataset[3][index])
         names, instance_features = features_from_set_of_polys(
@@ -84,7 +84,7 @@ def extract_features(dataset):
     my_dataset['polynomials'] = np.array(all_original_polynomials)
     my_dataset['names'] = np.array(names)
     my_dataset['features'] = np.array(all_features)
-    my_dataset['targets'] = np.array(all_targets)
+    my_dataset['labels'] = np.array(all_labels)
     my_dataset['timings'] = np.array(all_timings)
     my_dataset['projections'] = np.array(all_projections)
     my_dataset['cells'] = np.array(all_cells)
diff --git a/test_train_datasets.py b/test_train_datasets.py
index fd53ce1..8777fcc 100644
--- a/test_train_datasets.py
+++ b/test_train_datasets.py
@@ -10,6 +10,8 @@
 from packages.dataset_manipulation import remove_notunique_features
 from packages.dataset_manipulation import balance_dataset
 from packages.dataset_manipulation import augmentate_dataset
+from config.general_values import purposes
+from config.general_values import dataset_qualities
 from sklearn.model_selection import train_test_split
 from find_filename import find_dataset_filename
 from find_filename import find_other_filename
@@ -30,57 +32,87 @@ def create_train_test_datasets():
     # maybe it's better to create a dictionary for each dataset:
     # train/test, normal/balanced/augmented
     ###
-    x = dict()  # to keep the features
-    y = dict()  # to keep the labels
-    t = dict()  # to keep the timings
-    p = dict()  # to keep the projections
-    c = dict()  # to keep the number of cells
+    datasets = dict()
+    for purpose in purposes:
+        for quality in dataset_qualities:
+            datasets[purpose + '_' + quality] = dict()
     # train and test sets are created
     random_state = 0
-    x['train_normal'], x['test_normal'], \
-        y['train_normal'], y['test_normal'], \
-        t['train_normal'], t['test_normal'], \
-        p['train_normal'], p['test_normal'] = \
+    datasets['Train_Normal']['features'], \
+        datasets['Test_Normal']['features'], \
+        datasets['Train_Normal']['labels'], \
+        datasets['Test_Normal']['labels'], \
+        datasets['Train_Normal']['timings'], \
+        datasets['Test_Normal']['timings'], \
+        datasets['Train_Normal']['projections'], \
+        datasets['Test_Normal']['projections'], \
+        datasets['Train_Normal']['cells'], \
+        datasets['Test_Normal']['cells'] = \
         train_test_split(dataset['features'],
-                         dataset['targets'],
+                         dataset['labels'],
                          dataset['timings'],
                          dataset['projections'],
+                         dataset['cells'],
                          test_size=0.20,
                          random_state=random_state)
-    for purpose in ['train', 'test']:
-        x[f'{purpose}_balanced'], y[f'{purpose}_balanced'], t[f'{purpose}_balanced'] = balance_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
-        x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
-    dataset_info_file = find_dataset_filename('instances')
-    with open(dataset_info_file, 'w') as f_dataset_info:
-        writer = csv.writer(f_dataset_info)
-        writer.writerow(['dataset'] + ['zero', 'one', 'two', 'three', 'four', 'five', 'total'])
-    for usage in ['train', 'test']:
-        for method in ['normal', 'balanced', 'augmented']:
-            this_dataset_filename = find_dataset_filename(usage, method=method)
-            with open(this_dataset_filename, 'wb') as this_dataset_file:
-                if method == 'normal':
-                    pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}'], t[f'{usage}_{method}'], p[f'{usage}_{method}']), this_dataset_file)
-                else:
-                    pickle.dump((x[f'{usage}_{method}'], y[f'{usage}_{method}'], t[f'{usage}_{method}']), this_dataset_file)
+    keys = ['features', 'labels', 'timings', 'cells']
+    for purpose in purposes:
+        datasets[f'{purpose}_Balanced'] = \
+            {key: elem for key, elem in zip(keys,
+                                            balance_dataset(*[datasets[f'{purpose}_Normal'][key2]
+                                                              for key2 in keys])
+                                            )
+             }
+        datasets[f'{purpose}_Augmented'] = \
+            {key: elem for key, elem in zip(keys,
+                                            augmentate_dataset(*[datasets[f'{purpose}_Normal'][key2]
+                                                                 for key2 in keys])
+                                            )
+             }
+    for purpose in purposes:
+        for quality in dataset_qualities:
+            this_dataset_filename = find_dataset_filename(purpose, method=quality)
+            with open(this_dataset_filename, 'wb') as this_dataset_file:
+                pickle.dump(datasets[purpose + '_' + quality], this_dataset_file)
+
+    ## The following code counted how many instances of each class there are in the different datasets.
+    ## It should be replaced by a dedicated function.
+    # {datasets[f'{purpose}_balanced'][key]: elem for elem in balance_dataset(datasets[f'{purpose}_balanced'][key2] for key2 in keys) for key in keys}
+    # x[f'{purpose}_augmented'], y[f'{purpose}_augmented'], t[f'{purpose}_augmented'] = augmentate_dataset(x[f'{purpose}_normal'], y[f'{purpose}_normal'], t[f'{purpose}_normal'])
+#     dataset_info_file = find_dataset_filename('instances')
+#     with open(dataset_info_file, 'w') as f_dataset_info:
+#         writer = csv.writer(f_dataset_info)
+#         writer.writerow(['dataset'] + ['zero', 'one', 'two', 'three', 'four', 'five', 'total'])
+#     for purpose in purposes:
+#         for method in ['normal', 'balanced', 'augmented']:
+#             this_dataset_filename = find_dataset_filename(purpose, method=method)
+#             with open(this_dataset_filename, 'wb') as this_dataset_file:
+#                 if method == 'normal':
+#                     pickle.dump((x[f'{purpose}_{method}'], y[f'{purpose}_{method}'], t[f'{purpose}_{method}'], p[f'{purpose}_{method}']), this_dataset_file)
+#                 else:
+#                     pickle.dump((x[f'{purpose}_{method}'], y[f'{purpose}_{method}'], t[f'{purpose}_{method}']), this_dataset_file)
-            writer.writerow([f'{usage} {method} dataset']
-                            + [str(count_instances(y[f'{usage}_{method}'], i))
-                               for i in range(6)]
-                            + [str(len(y[f'{usage}_{method}']))])
+#             writer.writerow([f'{purpose} {method} dataset']
+#                             + [str(count_instances(y[f'{purpose}_{method}'], i))
+#                                for i in range(6)]
+#                             + [str(len(y[f'{purpose}_{method}']))])
 def create_regression_datasets(taking_logarithms=True):
-    for usage in ['train', 'test']:
-        this_dataset_filename = find_dataset_filename(usage,
+    for purpose in purposes:
+        this_dataset_filename = find_dataset_filename(purpose,
                                                       method='augmented')
         # we will use the augmented dataset here
         with open(this_dataset_filename, 'rb') as this_dataset_file:
-            X, Y, T = pickle.load(this_dataset_file)
+            regression_dataset = pickle.load(this_dataset_file)
         if taking_logarithms:
-            Y = [log(timings[0]) for timings in T]
+            regression_dataset['labels'] = [log(timings[0]) for timings in regression_dataset['timings']]
         else:
-            Y = [timings[0] for timings in T]
+            regression_dataset['labels'] = [timings[0] for timings in regression_dataset['timings']]
         this_dataset_filename =\
-            find_dataset_filename(usage, method='regression')
+            find_dataset_filename(purpose, method='regression')
         with open(this_dataset_filename, 'wb') as this_dataset_file:
-            pickle.dump((X, Y, T), this_dataset_file)
+            pickle.dump(regression_dataset, this_dataset_file)
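The multi-array call to train_test_split in create_train_test_datasets above relies on sklearn accepting any number of equally long arrays and returning a train/test pair for each, in order. A minimal standalone demonstration with toy arrays:

import numpy as np
from sklearn.model_selection import train_test_split

features = np.arange(10).reshape(5, 2)
labels = np.arange(5)
timings = np.arange(5) * 1.0
f_tr, f_te, l_tr, l_te, t_tr, t_te = train_test_split(
    features, labels, timings, test_size=0.2, random_state=0)
print(f_tr.shape, f_te.shape)  # (4, 2) (1, 2); rows stay aligned across the three arrays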
diff --git a/train_models.py b/train_models.py
index 84a2921..a344e39 100644
--- a/train_models.py
+++ b/train_models.py
@@ -11,47 +11,46 @@
 def train_model(ml_model, method):
-    train_data_filename = find_dataset_filename('train', method=method)
+    train_data_filename = find_dataset_filename('Train', method=method)
     hyperparams_file = find_hyperparams_filename(method, ml_model)
     with open(train_data_filename, 'rb') as train_data_file:
-        if method == "Normal":
-            x_train, y_train, _, _ = pickle.load(train_data_file)
-        else:
-            x_train, y_train, _ = pickle.load(train_data_file)
-        # a = pickle.load(train_data_file)
-        # print(a[0], type(a), len(a), method)
+        train_dataset = pickle.load(train_data_file)
     hyperparams = read_yaml_from_file(hyperparams_file)
     current_classifier = sklearn_models[ml_model]
     clf = current_classifier(**hyperparams)
-    clf.fit(x_train, y_train)
+    clf.fit(train_dataset['features'], train_dataset['labels'])
     trained_model_filename = find_model_filename(method, ml_model)
     with open(trained_model_filename, 'wb') as trained_model_file:
         pickle.dump(clf, trained_model_file)
 def train_regression_model(ml_model, method):
-    train_data_filename = find_dataset_filename('train', method=method)
+    train_data_filename = find_dataset_filename('Train', method=method)
     with open(train_data_filename, 'rb') as train_data_file:
-        x_train, _, t_train = pickle.load(train_data_file)
+        train_dataset = pickle.load(train_data_file)
     # hyperparams_file = find_hyperparams_filename(method, ml_model)
     # hyperparams = read_yaml_from_file(hyperparams_file)
-    x_train = np.asarray([x_t for x_t, t_t in zip(x_train, t_train)
-                          if t_t[:4] != 'Over'], dtype=float)
-    t_train = np.asarray([t_t for t_t in t_train
-                          if t_t[:4] != 'Over'], dtype=float)
-    current_classifier = regressors[ml_model]
-    # print(t_train)
+    train_dataset['features'] = np.asarray([x_t for x_t, t_t in zip(train_dataset['features'], train_dataset['timings'])
+                                            if t_t[:4] != 'Over'], dtype=float)
+    train_dataset['timings'] = np.asarray([t_t for t_t in train_dataset['timings']
+                                           if t_t[:4] != 'Over'], dtype=float)
+    ####
+    # IS THIS REALLY DOING SOMETHING?
+    # What if we used twice the time limit instead?
+    current_classifier = ml_regressors[ml_model]
+    # print(train_dataset['timings'])
     reg = current_classifier()  # **hyperparams)
-    reg.fit(x_train, t_train)
+    reg.fit(train_dataset['features'], train_dataset['timings'])
     # trained_model_filename = find_model_filename(method, ml_model, 'regression')
     # with open(trained_model_filename, 'wb') as trained_model_file:
     #     pickle.dump(reg, trained_model_file)
     print("Real")
-    print(t_train[10:20])
+    print(train_dataset['timings'][10:20])
     print("Predicted")
-    print(reg.predict(x_train)[10:20])
-    print(metrics.mean_squared_error(reg.predict(x_train), t_train))
+    print(reg.predict(train_dataset['features'])[10:20])
+    print(metrics.mean_squared_error(reg.predict(train_dataset['features']), train_dataset['timings']))
     return reg
@@ -61,7 +60,7 @@ def choose_using_regression(x_test, regressor):
 def test_regression_model(method, regressor):
-    test_data_filename = find_dataset_filename('test', method=method)
+    test_data_filename = find_dataset_filename('Test', method=method)
     with open(test_data_filename, 'rb') as test_data_file:
         x_test, y_test, t_test = pickle.load(test_data_file)
     x_test = np.asarray([x_t for x_t, t_t in zip(x_test, t_test)
@@ -69,7 +68,6 @@ def test_regression_model(method, regressor):
                          if t_t[:4] != 'Over'], dtype=float)
     y_test = np.asarray([y_t for y_t, t_t in zip(y_test, t_test)
                          if t_t[:4] != 'Over'], dtype=float)
     y_pred = [choose_using_regression(x_i, regressor) for x_i in x_test]
-    print("ACC", metrics.accuracy_score(y_test, y_pred))
 # for ml_reg in ml_regressors:
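The body of choose_using_regression is not shown in this diff. The following standalone sketch, with all names and data hypothetical, illustrates the regression pipeline the surrounding code implies: fit a regressor to log-timings (as create_regression_datasets produces), predict a timing for each of the six orderings' feature vectors, and pick the argmin. The candidate feature vectors stand in for what give_all_symmetries would supply in the repository.

import numpy as np
from math import log
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
features = rng.random((30, 6))               # toy per-instance feature vectors
timings = rng.uniform(0.01, 60.0, size=30)   # toy timings; real ones span orders of magnitude,
labels = np.array([log(t) for t in timings]) # which is why fitting log(t) weights relative error

reg = RandomForestRegressor(random_state=0).fit(features, labels)

def choose_using_regression_sketch(regressor, all_orderings_features):
    # predict a (log-)timing for the feature vector of every ordering,
    # then return the index of the ordering predicted to be fastest
    predictions = regressor.predict(np.asarray(all_orderings_features, dtype=float))
    return int(np.argmin(predictions))

candidates = rng.random((6, 6))  # stand-in for the six symmetric feature vectors
print(choose_using_regression_sketch(reg, candidates))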