Source code for rumboost.post_process

import pandas as pd
import numpy as np
import lightgbm as lgb
import os
from rumboost.rumboost import RUMBoost, rum_train
from rumboost.utility_plotting import weights_to_plot_v2

from biogeme.expressions import Beta, Variable, bioMultSum
from biogeme.models import piecewise_formula, loglogit
from biogeme.biogeme import BIOGEME
import biogeme.database as db


def split_fe_model(model: RUMBoost):
    """
    Separate a functional effect model into its two interleaved halves.

    The ensembles of a functional effect model alternate: even positions hold
    the trip-attribute ensembles, odd positions hold the socio-economic
    ensembles. Each half is copied into a fresh RUMBoost object together with
    the settings shared by both halves.

    Parameters
    ----------
    model: RUMBoost
        A functional effect RUMBoost model whose ``rum_structure`` is a list
        of 2*n_alt dictionaries.

    Returns
    -------
    attributes_model: RUMBoost
        The even-indexed ensembles: trip attributes, without interaction.
    socio_economic_model: RUMBoost
        The odd-indexed ensembles: the part leading to the individual-specific
        constant, where socio-economic characteristics fully interact.

    Raises
    ------
    ValueError
        If ``model.rum_structure`` is not a list.
    """
    if not isinstance(model.rum_structure, list):
        raise ValueError(
            "Please add a rum_structure to your model by setting model.rum_structure. A rum_structure must be a list of 2*n_alt dictionaries in this function"
        )

    # Settings that are copied verbatim into both halves.
    settings_before_bfps = ("num_classes", "device", "nests", "alphas")

    halves = (RUMBoost(), RUMBoost())
    for parity, half in enumerate(halves):
        # parity 0 -> attribute ensembles, parity 1 -> socio-economic ensembles
        half.boosters = [
            booster
            for position, booster in enumerate(model.boosters)
            if position % 2 == parity
        ]
        half.rum_structure = model.rum_structure[parity::2]
        for attribute in settings_before_bfps:
            setattr(half, attribute, getattr(model, attribute))
        half.boost_from_parameter_space = model.boost_from_parameter_space[parity::2]
        half.asc = model.asc

    attributes_model, socio_economic_model = halves
    return attributes_model, socio_economic_model
def bootstrap(
    dataset: pd.DataFrame,
    model_specification: dict,
    num_it: int = 100,
    seed: int = 42,
):
    """
    Train one RUMBoost model per bootstrap resample of ``dataset``.

    Every iteration draws ``len(dataset)`` index labels with replacement for
    training and validates on the labels that were not drawn. For now, only a
    basic rumboost can be used.

    Parameters
    ----------
    dataset: pd.DataFrame
        A dataset used to train RUMBoost. It must contain a ``choice`` column,
        which is used as the label.
    model_specification: dict
        The model specification, forwarded unchanged to rum_train().
    num_it: int, optional (default=100)
        The number of bootstrapping iterations.
    seed: int, optional (default=42)
        The seed used to randomly sample the dataset. It seeds numpy's global
        random state.

    Returns
    -------
    models: list
        The trained models, one per iteration.
    """

    def _to_lgb(frame):
        # Features are every column but "choice"; "choice" is the label.
        return lgb.Dataset(
            frame.drop("choice", axis=1), label=frame.choice, free_raw_data=False
        )

    np.random.seed(seed)
    n_obs = dataset.shape[0]

    trained = []
    for _ in range(num_it):
        drawn = np.random.choice(dataset.index, size=n_obs, replace=True)
        # Out-of-bag observations form the validation set.
        left_out = np.setdiff1d(dataset.index, drawn)

        train_set = _to_lgb(dataset.loc[drawn])
        valid_set = _to_lgb(dataset.loc[left_out])

        trained.append(
            rum_train(train_set, model_specification, valid_sets=[valid_set])
        )

    return trained
def _monotone_bounds(rum: dict):
    """Return the ``(lower, upper)`` Beta bounds implied by an ensemble's first monotone constraint."""
    constraints = rum["boosting_params"].get("monotone_constraints")
    # No declared constraint is treated as "unconstrained".
    has_constraint = constraints is not None and len(constraints) > 0
    direction = constraints[0] if has_constraint else 0
    lower = 0.0 if direction == 1 else None
    upper = 0.0 if direction == -1 else None
    return lower, upper


def _interval_indicator(base_name: str, lower, upper):
    """Return the expression that is true when ``base_name`` lies between ``lower`` and ``upper`` (inclusive)."""
    return ((Variable(base_name) - lower) * (Variable(base_name) - upper)) <= 0


def _build_test_database(dataset_test: pd.DataFrame, variables_created: dict):
    """Wrap ``dataset_test`` in a Biogeme database and define on it every indicator created for the training data."""
    test_database = db.Database("rumboost_test", dataset_test)
    for indicator, (base_name, lower, upper) in variables_created.items():
        if indicator not in test_database.variables:
            test_database.define_variable(
                indicator, _interval_indicator(base_name, lower, upper)
            )
    return test_database


def assist_model_spec(
    model: RUMBoost,
    dataset: pd.DataFrame,
    choice: pd.Series,
    alt_to_normalise: int = 0,
    return_utilities: bool = False,
    dataset_test: pd.DataFrame = None,
    choice_test: pd.Series = None,
):
    """
    Provide a piece-wise linear model specification based on a pre-trained
    rumboost model.

    Ensembles flagged in ``model.boost_from_parameter_space`` are turned into
    piece-wise linear terms; the others are turned into piece-wise constant
    terms built from interval indicator variables.

    Parameters
    ----------
    model: RUMBoost
        A trained rumboost model.
    dataset: pd.DataFrame
        A dataset used to train the model. It is not modified: the function
        works on a copy that carries the ``choice`` column.
    choice: pd.Series
        A series containing the choices.
    alt_to_normalise: int, optional (default=0)
        The variables of that alternative will be normalised when needed
        (socio-economic characteristics, ascs, ...).
    return_utilities: bool, optional (default=False)
        If True, the biogeme object is built on the utility expressions,
        otherwise it is built on the loglogit expression.
    dataset_test: pd.DataFrame, optional (default=None)
        Only for predictions. If None, the dataset used to train the model
        will be used. It is not modified.
    choice_test: pd.Series, optional (default=None)
        A series containing the choices for the test dataset.

    Returns
    -------
    the_biogeme: BIOGEME
        A biogeme object carrying the assisted specification, attached to the
        test database when ``dataset_test`` is given and to the training
        database otherwise.
    """
    # Work on copies so the caller's frames keep their original columns.
    # NOTE(review): Biogeme's Database presumably keeps a reference to the
    # frame it is given and adds defined variables to it - confirm.
    dataset = dataset.assign(choice=choice)
    if dataset_test is not None:
        if choice_test is not None:
            dataset_test = dataset_test.assign(choice=choice_test)
        else:
            dataset_test = dataset_test.copy()

    database = db.Database("rumboost", dataset)

    n_alternatives = model.num_classes

    # initialise utility specification with ascs, one normalised to zero
    utility_spec = {
        alt: Beta(f"asc_{alt}", 0, None, None, 1 if alt == alt_to_normalise else 0)
        for alt in range(n_alternatives)
    }

    # a variable entering every utility must be normalised for one alternative
    utilities_by_variable = {column: [] for column in dataset.columns}
    for rum in model.rum_structure:
        for variable in rum["variables"]:
            utilities_by_variable[variable].extend(rum["utility"])
    vars_to_normalise = {
        variable
        for variable, utilities in utilities_by_variable.items()
        if len(set(utilities)) == n_alternatives
    }

    # get aggregated split points and leaf values by ensembles and variables
    weights = weights_to_plot_v2(model)

    # indicator name -> (source column, lower bound, upper bound); the source
    # column is stored explicitly because column names may contain underscores
    variables_created = {}

    # loop over the ensembles
    for i, weight in weights.items():
        rum = model.rum_structure[int(i)]

        # loop over the variables within an ensemble
        for name, tree_info in weight.items():
            lowerbound, upperbound = _monotone_bounds(rum)
            normalised = (
                alt_to_normalise == rum["utility"][0] and name in vars_to_normalise
            )
            beta_fixed = 1 if normalised else 0

            raw_split_points = tree_info["Splitting points"]
            init_beta = tree_info["Histogram values"]
            # split points extended with the observed range of the variable
            bounded_split_points = [
                dataset[name].min(),
                *raw_split_points,
                dataset[name].max(),
            ]

            if model.boost_from_parameter_space[int(i)]:
                # piece-wise linear: one slope per interval
                betas = [
                    Beta(
                        f"b_{name}_{i}_{j}",
                        init_beta[j],
                        lowerbound,
                        upperbound,
                        beta_fixed,
                    )
                    for j in range(len(bounded_split_points) - 1)
                ]
                for u in rum["utility"]:
                    utility_spec[u] = utility_spec[u] + piecewise_formula(
                        name, bounded_split_points, betas
                    )
                continue

            # piece-wise constant
            if len(raw_split_points) == 1:
                # already binary: a single beta on the raw variable
                pairs = [
                    (
                        Beta(
                            f"b_{name}_{i}_0",
                            init_beta[0],
                            lowerbound,
                            upperbound,
                            beta_fixed,
                        ),
                        Variable(name),
                    )
                ]
            else:
                # The first interval is the reference level (beta fixed to 0,
                # no indicator). Each following level is the previous level
                # plus a delta, so a monotonicity constraint bounds the delta.
                level = Beta(f"b_{name}_{i}_0", 0, None, None, 1)
                pairs = []
                for j in range(1, len(bounded_split_points) - 1):
                    level = (
                        Beta(
                            f"delta_{name}_{i}_{j}",
                            init_beta[j] - init_beta[j - 1],
                            lowerbound,
                            upperbound,
                            beta_fixed,
                        )
                        + level
                    )
                    indicator = f"{name}_{i}_{j}"
                    lower = bounded_split_points[j]
                    upper = bounded_split_points[j + 1]
                    if indicator not in database.variables:
                        database.define_variable(
                            indicator, _interval_indicator(name, lower, upper)
                        )
                    variables_created[indicator] = (name, lower, upper)
                    # level j multiplies the indicator of interval j
                    pairs.append((level, Variable(indicator)))

            for u in rum["utility"]:
                utility_spec[u] = utility_spec[u] + bioMultSum(
                    [beta * variable for beta, variable in pairs]
                )

    availability = {alt: 1 for alt in range(n_alternatives)}

    if return_utilities:
        model_name = "assisted_model_utilities_pwlinear"
        formulas = {str(alt): utility_spec[alt] for alt in range(n_alternatives)}
    else:
        model_name = "assisted_model_pwlinear_lpmc"
        formulas = loglogit(utility_spec, availability, Variable("choice"))

    # if dataset_test is provided, the biogeme object is built on it
    if dataset_test is not None:
        target_database = _build_test_database(dataset_test, variables_created)
    else:
        target_database = database

    the_biogeme = BIOGEME(target_database, formulas)
    the_biogeme.modelName = model_name
    the_biogeme.calculateNullLoglikelihood(availability)

    return the_biogeme
def estimate_dcm_with_assisted_spec(
    dataset: pd.DataFrame,
    choice: pd.Series,
    model: RUMBoost,
    dataset_name: str = "SwissMetro",
):
    """
    Estimate a Discrete Choice Model (currently only logit) with a piece-wise
    linear model specification based on a pre-trained rumboost model.

    The estimation runs with the working directory temporarily set to
    ``results/<dataset_name>/assisted_specification/`` under the current
    directory; that directory must already exist. The original working
    directory is restored afterwards, even if the estimation fails.

    Parameters
    ----------
    dataset: pd.DataFrame
        A dataset used to train the model
    choice: pd.Series
        A series containing the choices
    model: RUMBoost
        A trained rumboost model.
    dataset_name: str, optional (default="SwissMetro")
        The dataset name, used to locate the results directory.

    Returns
    -------
    estimated_model: biogeme.results.bioResults
        The object returned by the biogeme ``estimate()`` call.
    """
    the_biogeme = assist_model_spec(model, dataset, choice)

    current_directory = os.getcwd()
    os.chdir(current_directory + f"/results/{dataset_name}/assisted_specification/")
    try:
        results = the_biogeme.estimate()
    finally:
        # Always return to the caller's directory, including on failure.
        os.chdir(current_directory)

    return results
def predict_with_assisted_spec(
    dataset_train: pd.DataFrame,
    dataset_test: pd.DataFrame,
    choice_train: pd.Series,
    choice_test: pd.Series,
    model: RUMBoost,
    beta_values: dict,
    utilities: bool = False,
):
    """
    Simulate the assisted piece-wise linear specification on a test set.

    The specification is rebuilt from the pre-trained rumboost model and the
    training data, attached to the test data, and then simulated with the
    supplied beta values.

    Parameters
    ----------
    dataset_train: pd.DataFrame
        A dataset used for estimation
    dataset_test: pd.DataFrame
        A dataset used for prediction
    choice_train: pd.Series
        A series containing the training set choices
    choice_test: pd.Series
        A series containing the test set choices
    model: RUMBoost
        A trained rumboost model.
    beta_values: dict
        A dictionary containing the beta values of the model, estimated on the
        train set.
    utilities: bool, optional (default=False)
        If True, the utilities are simulated instead of the log-probs.

    Returns
    -------
    prediction_results
        Whatever the biogeme ``simulate()`` call returns for ``beta_values``.
    """
    test_options = {
        "return_utilities": utilities,
        "dataset_test": dataset_test,
        "choice_test": choice_test,
    }
    simulator = assist_model_spec(model, dataset_train, choice_train, **test_options)
    return simulator.simulate(beta_values)