Source code for rumboost.datasets

import numpy as np
import pandas as pd
import pickle
import random
import sys
import gc

from lightgbm import Dataset
from collections import Counter, defaultdict

try:
    from sklearn.model_selection import train_test_split, GroupShuffleSplit, GroupKFold

    sklearn_installed = True
except ImportError:
    sklearn_installed = False

sys.path.append("../")


def load_preprocess_LPMC(
    path="/media/nicolas-salvade/Windows/Users/DAF1/OneDrive - University College London/Documents/PhD - UCL/rumboost-dev/Data/",
):
    """
    Load and preprocess the LPMC dataset.

    Parameters
    ----------
    path : str, optional
        Prefix prepended verbatim to every file name (``LPMC_train.csv``,
        ``LPMC_test.csv`` and the fold cache
        ``strat_group_k_fold_london.pickle``), so it must end with a path
        separator.

    Returns
    -------
    dataset_train : pandas DataFrame
        The training dataset ready to use.
    dataset_test : pandas DataFrame
        The test dataset ready to use.
    folds : zip(list, list)
        Pairs of (train indices, test indices) for cross-validation. They are
        read from the fold cache when it exists; otherwise 5 folds grouped by
        household are computed and written to the cache.
    """
    # source: https://github.com/JoseAngelMartinB/prediction-behavioural-analysis-ml-travel-mode-choice
    data_train = pd.read_csv(path + "LPMC_train.csv")
    data_test = pd.read_csv(path + "LPMC_test.csv")

    # distance in km
    data_train["distance"] = data_train["distance"] / 1000
    data_test["distance"] = data_test["distance"] / 1000

    # rename label
    label_name = {"travel_mode": "choice"}
    dataset_train = data_train.rename(columns=label_name)
    dataset_test = data_test.rename(columns=label_name)

    # get all features
    target = "choice"
    features = [f for f in dataset_test.columns if f != target]

    # get household ids
    hh_id = np.array(data_train["household_id"].values)

    # k folds sampled by households for cross validation
    fold_file = path + "strat_group_k_fold_london.pickle"
    try:
        # NOTE: pickle can execute arbitrary code; only use trusted cache files.
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx = []
        test_idx = []
        for train_i, test_i in stratified_group_k_fold(
            data_train[features], data_train["travel_mode"], hh_id, k=5
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return dataset_train, dataset_test, folds
def load_preprocess_SwissMetro(
    test_size: float = 0.3,
    random_state: int = 42,
    full_data=False,
    path="../Data/",
):
    """
    Load and preprocess the SwissMetro dataset. See Biogeme website for data.

    Parameters
    ----------
    test_size : float, optional (default = 0.3)
        The proportion of data used for test set.
    random_state : int, optional (default = 42)
        For reproducibility in the train-test split and the CV folds.
    full_data : bool, optional (default = False)
        If True, return the whole preprocessed dataset as a single DataFrame
        (no train/test split, no folds).
    path : str, optional
        Prefix prepended verbatim to the data and fold-cache file names.

    Returns
    -------
    df_final : pandas DataFrame
        Only when ``full_data`` is True: the full preprocessed dataset.
    df_train : pandas DataFrame
        The training dataset ready to use.
    df_test : pandas DataFrame
        The test dataset ready to use.
    folds : zip(list, list)
        Pairs of (train indices, test indices) grouped by ``ID`` for CV, read
        from the fold cache when present, otherwise computed (5 folds) and
        cached.

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")
    df = pd.read_csv(path + "swissmetro.dat", sep="\t")

    label_name = {"CHOICE": "choice"}

    # keep only observations with a recorded choice and an available car;
    # copy so the column assignments below never write into a slice of the
    # raw frame (avoids SettingWithCopyWarning / silently lost writes)
    keep = (df["CHOICE"] != 0) & (df["CAR_AV"] == 1)
    df = df[keep].copy()

    # apply cost to people without GA
    df["TRAIN_COST"] = df["TRAIN_CO"] * (df["GA"] == 0)
    df["SM_COST"] = df["SM_CO"] * (df["GA"] == 0)

    # rescale choice from 0 to 2
    df.loc[:, "CHOICE"] = df["CHOICE"] - 1

    # luggage dummies
    df["SEV_LUGGAGES"] = (df["LUGGAGE"] == 3).astype(int)

    # origin / destination dummies
    rom_codes = [10, 22, 23, 24, 25, 26]
    tic_codes = [21]
    df["ORIG_ROM"] = df["ORIGIN"].isin(rom_codes).astype(int)
    df["ORIG_TIC"] = df["ORIGIN"].isin(tic_codes).astype(int)
    df["DEST_ROM"] = df["DEST"].isin(rom_codes).astype(int)
    df["DEST_TIC"] = df["DEST"].isin(tic_codes).astype(int)

    # purpose dummies (PURPOSE_1 .. PURPOSE_8)
    for level in range(1, 9):
        df[f"PURPOSE_{level}"] = (df["PURPOSE"] == level).astype(int)

    # age dummies (AGE_1 .. AGE_4)
    for level in range(1, 5):
        df[f"AGE_{level}"] = (df["AGE"] == level).astype(int)

    # final dataset; the remaining dummies computed above are not selected
    df_final = df[
        [
            "ID",
            "TRAIN_TT",
            "TRAIN_COST",
            "TRAIN_HE",
            "SM_TT",
            "SM_COST",
            "SM_HE",
            "CAR_TT",
            "CAR_CO",
            "MALE",
            "FIRST",
            "PURPOSE_1",
            "PURPOSE_2",
            "PURPOSE_3",
            "PURPOSE_4",
            "PURPOSE_5",
            "PURPOSE_6",
            "PURPOSE_7",
            "PURPOSE_8",
            "AGE_1",
            "AGE_2",
            "CHOICE",
        ]
    ]
    df_final = df_final.rename(columns=label_name)

    if full_data:
        return df_final

    # split by individual ID so one respondent never appears in both sets
    gsp = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_idx, test_idx = next(gsp.split(df_final, groups=df_final["ID"]))
    df_train, df_test = df_final.iloc[train_idx], df_final.iloc[test_idx]

    hh_id = df_train["ID"]

    # k folds sampled by ID for cross validation
    fold_file = path + "strat_group_k_fold_swissmetro.pickle"
    try:
        # NOTE: pickle can execute arbitrary code; only use trusted cache files.
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx = []
        test_idx = []
        for train_i, test_i in stratified_group_k_fold(
            df_train[
                [
                    "TRAIN_TT",
                    "TRAIN_COST",
                    "TRAIN_HE",
                    "SM_TT",
                    "SM_COST",
                    "SM_HE",
                    "CAR_TT",
                    "CAR_CO",
                ]
            ],
            df_train["choice"],
            hh_id,
            k=5,
            seed=random_state,
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return df_train, df_test, folds
def load_preprocess_Optima():
    """
    Load and preprocess the Optima dataset. See Biogeme website for data.

    Files are read from (and the fold cache written to) the hard-coded
    relative directory ``../Data/``.

    Returns
    -------
    dataset_train : pandas DataFrame
        The training dataset ready to use.
    dataset_test : pandas DataFrame
        The test dataset ready to use.
    folds : zip(list, list)
        Pairs of (train indices, test indices) grouped by ``ID`` for CV, read
        from the fold cache when present, otherwise computed (5 folds) and
        cached.

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")
    # source: https://github.com/JoseAngelMartinB/prediction-behavioural-analysis-ml-travel-mode-choice
    data_train = pd.read_csv("../Data/optima_ext_train.csv")
    data_test = pd.read_csv("../Data/optima_ext_test.csv")

    # get household ids (before the ID column is dropped)
    hh_id = np.array(data_train["ID"].values)

    # rename label and drop IDs
    label_name = {"Choice": "choice"}
    dataset_train = data_train.rename(columns=label_name).drop("ID", axis=1)
    dataset_test = data_test.rename(columns=label_name).drop("ID", axis=1)

    # get all features
    target = "choice"
    features = [f for f in dataset_train.columns if f != target]

    # k folds sampled by households for cross validation
    fold_file = "../Data/strat_group_k_fold_optima.pickle"
    try:
        # NOTE: pickle can execute arbitrary code; only use trusted cache files.
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx = []
        test_idx = []
        for train_i, test_i in stratified_group_k_fold(
            dataset_train[features], dataset_train[target], hh_id, k=5
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return dataset_train, dataset_test, folds
def load_preprocess_Netherlands(test_size: float = 0.3, random_state: int = 42):
    """
    Load and preprocess the Netherlands dataset. See Biogeme website for data.

    Only revealed-preference rows (``rp == 1``) are kept. Files are read from
    (and the fold cache written to) the hard-coded relative directory
    ``../Data/``.

    Parameters
    ----------
    test_size : float, optional (default = 0.3)
        The proportion of data used for test set.
    random_state : int, optional (default = 42)
        For reproducibility in the train-test split.

    Returns
    -------
    df_train : pandas DataFrame
        The training dataset.
    df_test : pandas DataFrame
        The test dataset.
    folds : zip(list, list)
        Pairs of (train indices, test indices) grouped by ``id`` for CV, read
        from the fold cache when present, otherwise computed (5 folds) and
        cached.

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")
    raw = pd.read_table("../Data/netherlands.dat")

    # copy so the derived columns are written to an independent frame rather
    # than to a slice of `raw` (avoids SettingWithCopyWarning)
    data_rp = raw[raw["rp"] == 1].copy()

    # total travel time = in-vehicle + out-of-vehicle time
    data_rp["rail_time"] = data_rp["rail_ivtt"] + data_rp["rp_rail_ovt"]
    data_rp["car_time"] = data_rp["car_ivtt"] + data_rp["rp_car_ovt"]

    # costs rescaled by a fixed factor (presumably a guilder-to-euro rate)
    data_rp["car_cost_euro"] = data_rp["car_cost"] * 0.44378022
    data_rp["rail_cost_euro"] = data_rp["rail_cost"] * 0.44378022

    data_rp = data_rp.drop(
        [
            "rp",
            "sp",
            "rail_comfort",
            "rail_ivtt",
            "rail_cost",
            "rail_acc_time",
            "rail_egr_time",
            "rail_transfers",
            "car_ivtt",
            "car_cost",
            "car_walk_time",
            "rp_choice",
            "rp_rail_ovt",
            "rp_car_ovt",
        ],
        axis=1,
    )

    df_train, df_test = train_test_split(
        data_rp, test_size=test_size, random_state=random_state
    )

    # get all features
    target = "choice"
    features = [f for f in df_train.columns if f != target]

    # get group ids
    hh_id = np.array(df_train["id"].values)

    # k folds sampled by group id for cross validation
    fold_file = "../Data/strat_group_k_fold_netherlands.pickle"
    try:
        # NOTE: pickle can execute arbitrary code; only use trusted cache files.
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx = []
        test_idx = []
        for train_i, test_i in stratified_group_k_fold(
            df_train[features], df_train[target], hh_id, k=5
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return df_train, df_test, folds
def load_preprocess_Airplane(test_size: float = 0.3, random_state: int = 42):
    """
    Load and preprocess the Airplane dataset. See Biogeme website for data.

    Parameters
    ----------
    test_size : float, optional (default = 0.3)
        The proportion of data used for test set.
    random_state : int, optional (default = 42)
        For reproducibility in the train-test split.

    Returns
    -------
    df_train : pandas DataFrame
        The training dataset.
    df_test : pandas DataFrame
        The test dataset.
    folds : zip(list, list)
        Pairs of (train indices, test indices) for CV, read from the fold
        cache when present, otherwise computed (5 folds) and cached. Each row
        is its own group, since the row index is used as the group id.

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")
    # NOTE(review): the data is read from "Data/" while the fold cache lives
    # in "../Data/" -- confirm which working directory is intended.
    data = pd.read_table("Data/airline.dat")

    # encode the chosen alternative as 0, 1 or 2 from the one-hot columns
    data["choice"] = (data["BestAlternative_2"] == 1) + 2 * (
        data["BestAlternative_3"] == 1
    )
    data = data.drop(
        ["BestAlternative_1", "BestAlternative_2", "BestAlternative_3"], axis=1
    )

    # fares divided by 100
    for alt in (1, 2, 3):
        data[f"Fare_{alt}_scaled"] = data[f"Fare_{alt}"] / 100

    # trip time differences relative to alternative 1
    data["TTDIFF_TRANSFER"] = data["TripTimeHours_2"] - data["TripTimeHours_1"]
    data["TTDIFF_TRANSFER_TWOAIRLINES"] = (
        data["TripTimeHours_3"] - data["TripTimeHours_1"]
    )

    df = data[
        [
            "DepartureTimeHours_1",
            "DepartureTimeHours_2",
            "DepartureTimeHours_3",
            "ArrivalTimeHours_1",
            "ArrivalTimeHours_2",
            "ArrivalTimeHours_3",
            "TTDIFF_TRANSFER",
            "TTDIFF_TRANSFER_TWOAIRLINES",
            "Legroom_1",
            "Legroom_2",
            "Legroom_3",
            "Fare_1_scaled",
            "Fare_2_scaled",
            "Fare_3_scaled",
            "choice",
        ]
    ]

    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # get all features
    target = "choice"
    features = [f for f in df_train.columns if f != target]

    # no grouping column: every observation is its own group
    hh_id = df_train.index.tolist()

    # k folds for cross validation
    fold_file = "../Data/strat_group_k_fold_airplane.pickle"
    try:
        # NOTE: pickle can execute arbitrary code; only use trusted cache files.
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx = []
        test_idx = []
        for train_i, test_i in stratified_group_k_fold(
            df_train[features], df_train[target], hh_id, k=5
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return df_train, df_test, folds
def load_preprocess_Telephone(test_size: float = 0.3, random_state: int = 3):
    """
    Load and preprocess the Telephone dataset. See Biogeme website for data.

    Parameters
    ----------
    test_size : float, optional (default = 0.3)
        The proportion of data used for test set.
    random_state : int, optional (default = 3)
        For reproducibility in the train-test split.

    Returns
    -------
    df_train : pandas DataFrame
        The training dataset.
    df_test : pandas DataFrame
        The test dataset.
    folds : zip(list, list)
        Pairs of (train indices, test indices) for CV, read from the fold
        cache when present, otherwise computed (5 folds) and cached. Each row
        is its own group, since the row index is used as the group id.

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")
    # NOTE(review): the data is read from "Data/" while the fold cache lives
    # in "../Data/" -- confirm which working directory is intended.
    data = pd.read_table("Data/telephone.dat")

    # shift the choice so that it starts at 0
    data.loc[:, "choice"] = data["choice"] - 1

    # costs divided by 10 (cost1 .. cost5)
    for alt in range(1, 6):
        data[f"cost{alt}_scaled"] = data[f"cost{alt}"] / 10

    df_train, df_test = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    # get all features
    target = "choice"
    features = [f for f in df_train.columns if f != target]

    # no grouping column: every observation is its own group
    hh_id = df_train.index.tolist()

    # k folds for cross validation
    fold_file = "../Data/strat_group_k_fold_telephone.pickle"
    try:
        # NOTE: pickle can execute arbitrary code; only use trusted cache files.
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx = []
        test_idx = []
        for train_i, test_i in stratified_group_k_fold(
            df_train[features], df_train[target], hh_id, k=5
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return df_train, df_test, folds
def load_preprocess_Parking(test_size: float = 0.3, random_state: int = 42):
    """
    Load and preprocess the Parking dataset. See Biogeme website for data.

    Parameters
    ----------
    test_size : float, optional (default = 0.3)
        The proportion of data used for test set.
    random_state : int, optional (default = 42)
        For reproducibility in the train-test split.

    Returns
    -------
    df_train : pandas DataFrame
        The training dataset.
    df_test : pandas DataFrame
        The test dataset.
    folds : zip(list, list)
        Pairs of (train indices, test indices) for CV, read from the fold
        cache when present, otherwise computed (5 folds) and cached. Each row
        is its own group, since the row index is used as the group id.

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")
    # NOTE(review): the data is read from "Data/" while the fold cache lives
    # in "../Data/" -- confirm which working directory is intended.
    data = pd.read_table("Data/parking.dat")

    # shift the choice so that it starts at 0
    data.loc[:, "CHOICE"] = data["CHOICE"] - 1

    # drop identifier columns and rename the label
    data = data.drop(["ID", "OBSID", "SCENARIO"], axis=1)
    label_name = {"CHOICE": "choice"}
    data = data.rename(columns=label_name)

    df_train, df_test = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    # get all features
    target = "choice"
    features = [f for f in df_train.columns if f != target]

    # the ID column was dropped above: every observation is its own group
    hh_id = df_train.index.tolist()

    # k folds for cross validation
    fold_file = "../Data/strat_group_k_fold_parking.pickle"
    try:
        # NOTE: pickle can execute arbitrary code; only use trusted cache files.
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx = []
        test_idx = []
        for train_i, test_i in stratified_group_k_fold(
            df_train[features], df_train[target], hh_id, k=5
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return df_train, df_test, folds
def load_preprocess_Vaccines():
    """
    Load and preprocess the Vaccines dataset.

    Rows with ``Wave4 == 1`` form the test set; all other rows form the
    training set. Files are read from (and the fold cache written to) the
    hard-coded relative directory ``../Data/``.

    Returns
    -------
    df_train : pandas DataFrame
        The training dataset (``Wave4 != 1``).
    df_test : pandas DataFrame
        The test dataset (``Wave4 == 1``).
    folds : zip(list, list)
        Pairs of (train indices, test indices) grouped by ``IDnum`` for CV,
        read from the fold cache when present, otherwise computed (5 folds)
        and cached.
    """
    data = pd.read_csv("../Data/vaccinechoiceMar12.csv")

    # shift the choice so that it starts at 0
    data.loc[:, "choice"] = data["vaccinechoice"] - 1

    # strip the dot from alternative-specific columns: "cost.1" -> "cost1"
    attributes = [
        "cost",
        "effectiveness",
        "protection",
        "incubation",
        "severe",
        "mild",
        "doses",
        "booster",
        "USA",
        "UK",
        "Germany",
        "China",
        "Russia",
        "media",
        "CDC",
        "WHO",
        "months",
    ]
    new_names = {
        f"{attr}.{alt}": f"{attr}{alt}" for alt in ("1", "3") for attr in attributes
    }
    data = data.rename(columns=new_names)

    data_cleaned = data.drop(["ID", "ZIP", "state"], axis=1)

    # wave 4 is held out as the test set
    df_train = data_cleaned[data_cleaned["Wave4"] != 1]
    df_test = data_cleaned[data_cleaned["Wave4"] == 1]

    # get respondent ids
    hh_id = df_train["IDnum"]

    # get all features ("IDnum" and "Wave4" are kept in the frames)
    target = "choice"
    features = [f for f in df_train.columns if f != target]

    # k folds sampled by respondent for cross validation
    fold_file = "../Data/strat_group_k_fold_vaccine.pickle"
    try:
        # NOTE: pickle can execute arbitrary code; only use trusted cache files.
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx = []
        test_idx = []
        for train_i, test_i in stratified_group_k_fold(
            df_train[features], df_train[target], hh_id, k=5
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return df_train, df_test, folds
def load_preprocess_MTMC(
    test_size: float = 0.2,
    random_state: int = 1,
    path="/media/nicolas-salvade/Windows/Users/DAF1/OneDrive - University College London/Documents/PhD - UCL/rumboost/Data/",
):
    """
    Load and preprocess the MTMC dataset.

    Parameters
    ----------
    test_size : float, optional (default = 0.2)
        The proportion of households used for the test set.
    random_state : int, optional (default = 1)
        For reproducibility in the train-test split.
    path : str, optional
        Prefix prepended verbatim to the data, zone-index and fold-cache file
        names, so it must end with a path separator.

    Returns
    -------
    df_train : pandas DataFrame
        The training dataset.
    df_test : pandas DataFrame
        The test dataset.
    folds : zip(list, list)
        Pairs of (train indices, test indices) grouped by ``HHNR`` for CV,
        read from the fold cache when present, otherwise computed with
        ``GroupKFold`` and cached.
    z_idx : list
        Values loaded from ``z_idx.csv`` (destination zones).

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")

    # load data
    data = pd.read_csv(path + "data_laus_trips_prep_attractions_allalt.csv")

    # load destination zones
    z_idx = list(np.loadtxt(path + "z_idx.csv"))

    # split by household
    gsp = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_idx, test_idx = next(gsp.split(data, groups=data["HHNR"]))
    df_train, df_test = data.iloc[train_idx], data.iloc[test_idx]

    # get all features
    target = "choice"
    features = [f for f in df_train.columns if f != target]

    hh_id = df_train["HHNR"]

    # k folds sampled by households for cross validation
    fold_file = path + "strat_group_k_fold_mtmc.pickle"
    try:
        # NOTE: pickle can execute arbitrary code; only use trusted cache files.
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx = []
        test_idx = []
        gkf = GroupKFold()
        for train_i, test_i in gkf.split(df_train[features], df_train[target], hh_id):
            train_idx.append(train_i)
            test_idx.append(test_i)
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return df_train, df_test, folds, z_idx
def load_preprocess_MTMC_all(
    test_size: float = 0.2,
    random_state: int = 1,
    path="/media/nicolas-salvade/Windows/Users/DAF1/OneDrive - University College London/Documents/PhD - UCL/rumboost/Data/",
):
    """
    Load and preprocess the MTMC dataset for all swiss zones.

    The cached train/test pickles are loaded when they exist. Otherwise the
    raw trips are loaded, trips whose origin or destination index is in
    7965..7977 are removed, the data is split by household and both splits
    are written to the cache.

    Parameters
    ----------
    test_size : float, optional (default = 0.2)
        The proportion of households used for the test set.
    random_state : int, optional (default = 1)
        For reproducibility in the train-test split.
    path : str, optional
        Prefix prepended verbatim to every file name, so it must end with a
        path separator.

    Returns
    -------
    df_train : pandas DataFrame
        The training dataset.
    df_test : pandas DataFrame
        The test dataset.
    folds : None
        Cross-validation folds are not computed by this function.
    z_idx : list
        Values loaded from ``z_idx_all_wo_alps.csv`` (destination zones).

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")

    train_file = path + "train_set_switzerland.pkl"
    test_file = path + "test_set_switzerland.pkl"

    try:
        # NOTE: pickle can execute arbitrary code; only use trusted cache files.
        z_idx = list(np.loadtxt(path + "z_idx_all_wo_alps.csv"))
        with open(train_file, "rb") as f:
            df_train = pickle.load(f)
        with open(test_file, "rb") as f:
            df_test = pickle.load(f)
    except FileNotFoundError:
        # load data
        # NOTE(review): an absolute path is appended to `path`, which can only
        # resolve when `path` is empty -- confirm whether the prefix or the
        # absolute part is the intended location. Behaviour kept unchanged.
        with open(
            path
            + "/media/nicolas-salvade/Windows/Users/DAF1/OneDrive - University College London/Documents/PhD - UCL/choice_set_location_travelmode/Data/input/data_switzerland_trips_preprocessed.pkl",
            "rb",
        ) as f:
            data = pickle.load(f)

        # load destination zones
        z_idx = list(np.loadtxt(path + "z_idx_all_wo_alps.csv"))

        # discard trips starting or ending in the excluded zones
        zone_to_drop = list(range(7965, 7978))
        mask = ~data["d_idx"].isin(zone_to_drop) & ~data["o_idx"].isin(zone_to_drop)
        data = data[mask]

        # split by household
        gsp = GroupShuffleSplit(
            n_splits=1, test_size=test_size, random_state=random_state
        )
        train_idx, test_idx = next(gsp.split(data, groups=data["HHNR"]))
        df_train, df_test = data.iloc[train_idx], data.iloc[test_idx]

        # cache both splits; `with` guarantees the files are flushed and closed
        with open(train_file, "wb") as f:
            pickle.dump(df_train, f)
        with open(test_file, "wb") as f:
            pickle.dump(df_test, f)

    # cross-validation folds are intentionally not built here
    folds = None

    return df_train, df_test, folds, z_idx
# Sample a dataset grouped by `groups` and stratified by `y`.
# Source: https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """
    Stratified Group K-Fold cross-validator

    Provides train/test indices to split data in train/test sets. Every group
    is assigned greedily to the fold that keeps the per-label share of
    observations most even across folds, so a group never appears in both the
    train and the test side of a split.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The input samples. Not used by the algorithm; kept for a
        scikit-learn-like signature.
    y : array-like of shape (n_samples,)
        The target values. They are used as array indices, so they must be
        non-negative integers.
    groups : array-like of shape (n_samples,)
        Group labels for the samples used while splitting the dataset into
        train/test set. Must be hashable.
    k : int
        Number of folds.
    seed : int, optional
        Random seed for shuffling the groups before the greedy assignment.

    Yields
    ------
    train : list of int
        The training set indices for that split.
    test : list of int
        The testing set indices for that split.
    """
    labels_num = np.max(y) + 1

    # per-group and global label counts
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    # Label ids below max(y) that never occur have a zero total count.
    # Scoring them would compute 0/0 = NaN, every comparison against NaN is
    # False, and all groups would end up in fold 0. Score observed labels only.
    present_labels = [label for label in range(labels_num) if y_distr[label] > 0]

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        # tentatively add the group to `fold`, score the imbalance, then undo
        y_counts_per_fold[fold] += y_counts
        std_per_label = [
            np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            for label in present_labels
        ]
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    # place the most label-imbalanced groups first (stable sort keeps the
    # shuffled order among ties)
    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for fold in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, fold)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = fold
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for fold in range(k):
        test_groups = groups_per_fold[fold]
        train_groups = all_groups - test_groups

        train_indices = [idx for idx, g in enumerate(groups) if g in train_groups]
        test_indices = [idx for idx, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices
def prepare_dataset(
    rum_structure,
    df_train,
    num_classes,
    df_test=None,
    target="choice",
    free_raw_data=False,
    save_dataset=None,
    load_dataset=None,
):
    """
    Prepare and save if required the datasets for RUMBoost.

    One lightgbm ``Dataset`` is built per entry of ``rum_structure`` (plus one
    per entry and per test DataFrame when ``df_test`` is given). When
    ``load_dataset`` is set, the datasets are instead read back from files
    previously written through ``save_dataset``.

    Parameters
    ----------
    rum_structure : list of dict
        The structure of the RUM model. The entries read here are
        ``"variables"``, ``"shared"``, ``"utility"`` and ``"boosting_params"``.
    df_train : pandas DataFrame
        The training dataset. Not read when ``load_dataset`` is set.
    num_classes : int
        The number of classes.
    df_test : list of pandas DataFrame, optional
        The list of test datasets.
    target : str, optional
        The target variable.
    free_raw_data : bool, optional
        If the raw data of the training datasets should be freed.
    save_dataset : str, optional
        File-name prefix used to save the datasets.
    load_dataset : str, optional
        File-name prefix used to load the datasets.

    Returns
    -------
    train_sets : dict
        ``"num_data"``, ``"labels"``, ``"labels_j"`` and, under
        ``"train_sets"``, the list of training ``Dataset`` objects.
    valid_sets : dict
        Empty when ``df_test`` is None; otherwise ``"num_data"``,
        ``"valid_labels"`` and, under ``"valid_sets"``, the validation
        ``Dataset`` objects indexed as ``[test set][rum_structure entry]``.

    Raises
    ------
    FileNotFoundError
        If anything goes wrong while loading from ``load_dataset``; the
        original error is attached as ``__cause__``.
    """
    valid_sets = {}
    num_datasets = len(rum_structure)

    if load_dataset:
        try:
            # NOTE: pickle can execute arbitrary code; only load trusted files.
            with open(f"{load_dataset}_train_sets.pkl", "rb") as f:
                train_sets = pickle.load(f)
            if df_test is not None:
                with open(f"{load_dataset}_valid_sets.pkl", "rb") as f:
                    valid_sets = pickle.load(f)
        except Exception as err:
            # same exception type as before, but the root cause is chained and
            # KeyboardInterrupt/SystemExit are no longer swallowed
            raise FileNotFoundError(
                "Error loading datasets, try running again this function without the load_dataset parameter."
            ) from err

        train_set_J = []
        reduced_valid_sets_J = []
        try:
            for j, _ in enumerate(rum_structure):
                print(
                    "-" * 30
                    + "\n"
                    + f"[{j+1}/{num_datasets}] \t Loading dataset {j+1}..."
                )
                train_set_J.append(
                    Dataset(
                        data=f"{load_dataset}_train_set_{j}.bin", free_raw_data=False
                    )
                )
                if df_test is not None:
                    reduced_valid_sets_j = []
                    for i, _ in enumerate(df_test):
                        reduced_valid_sets_j.append(
                            Dataset(
                                data=f"{load_dataset}_valid_set_{j}_{i}.bin",
                                free_raw_data=False,
                                reference=train_set_J[j],
                            )
                        )
                    reduced_valid_sets_J.append(reduced_valid_sets_j)
                print("\t done! \n" + "-" * 30 + "\n")
        except Exception as err:
            raise FileNotFoundError(
                "Error loading dataset, try running again this function without the load_dataset parameter."
            ) from err

        train_sets["train_sets"] = train_set_J
        if df_test is not None:
            # transpose from [entry][test set] to [test set][entry]
            valid_sets["valid_sets"] = np.array(reduced_valid_sets_J).T.tolist()

        return train_sets, valid_sets

    # one-hot encode the labels: labels_j[n, c] == 1 iff observation n chose c
    labels = df_train[target].to_numpy().astype(np.int32)
    labels_j = (labels[:, None] == np.array(range(num_classes))[None, :]).astype(
        np.int8
    )
    num_obs = df_train.shape[0]

    if df_test is not None:
        valid_labels = []
        num_obs_test = []
        for df in df_test:
            valid_labels += [df[target].to_numpy().astype(np.int32)]
            num_obs_test += [df.shape[0]]
        val_labels_j = [
            (val_labs[:, None] == np.array(range(num_classes))[None, :]).astype(np.int8)
            for val_labs in valid_labels
        ]

    train_set_J = []
    reduced_valid_sets_J = []

    for j, struct in enumerate(rum_structure):
        print("-" * 30 + "\n" + f"[{j+1}/{num_datasets}] \t Loading dataset {j+1}...")
        if struct:
            if "variables" in struct:
                # only relevant features for the jth booster
                train_set_j_data = df_train[struct["variables"]]
                if struct["shared"] == True:
                    # stack the label columns of the utilities sharing this
                    # booster, column after column (Fortran order)
                    new_label = labels_j[
                        :, struct["utility"][: len(struct["variables"])]
                    ].reshape(-1, order="F")
                    feature_names = "auto"
                else:
                    new_label = labels_j[:, 0].reshape(-1, order="F")
                    feature_names = struct["variables"]

                # create and build dataset; the data is reshaped so that it
                # has exactly one row per label
                train_set_j = Dataset(
                    train_set_j_data.values.reshape((len(new_label), -1), order="A"),
                    label=new_label,
                    free_raw_data=free_raw_data,
                )
                categorical_feature = struct["boosting_params"].get(
                    "categorical_feature", "auto"
                )
                train_set_j._update_params(struct["boosting_params"])._set_predictor(
                    None
                ).set_feature_name(feature_names).set_categorical_feature(
                    categorical_feature
                )

                if df_test is not None:
                    reduced_valid_sets_j = []
                    for i, valid_set in enumerate(df_test):
                        # create and build validation sets,
                        # only relevant features for the jth booster
                        valid_set_j_data = valid_set[struct["variables"]]
                        if struct["shared"] == True:
                            label_valid = val_labels_j[i][
                                :, struct["utility"][: len(struct["variables"])]
                            ].reshape(-1, order="F")
                        else:
                            label_valid = val_labels_j[i][:, 0].reshape(-1, order="F")
                        valid_set_j = Dataset(
                            valid_set_j_data.values.reshape(
                                (len(label_valid), -1),
                                order="A",
                            ),
                            label=label_valid,
                            free_raw_data=False,
                            reference=train_set_j,
                        )
                        valid_set_j._update_params(struct["boosting_params"])
                        reduced_valid_sets_j.append(valid_set_j)

                        if save_dataset:
                            valid_set_j.save_binary(
                                f"{save_dataset}_valid_set_{j}_{i}.bin"
                            )

                train_set_J.append(train_set_j)
                if save_dataset:
                    train_set_j.save_binary(f"{save_dataset}_train_set_{j}.bin")
                if df_test is not None:
                    reduced_valid_sets_J.append(reduced_valid_sets_j)

                # Release the loop-local references before the next (possibly
                # large) dataset is built. Rebinding to None, unlike `del`,
                # cannot fail when no validation set was created.
                train_set_j_data = valid_set_j_data = None
                train_set_j = valid_set_j = None
                gc.collect()
                print("\t done! \n" + "-" * 30 + "\n")

            else:
                # if no alternative specific datasets
                # NOTE(review): the objects built here are never appended to
                # train_set_J / reduced_valid_sets_J -- confirm whether this
                # branch is still meant to contribute datasets.
                new_label = np.where(labels == j, 1, 0)
                train_set_j = Dataset(
                    df_train.values, label=new_label, free_raw_data=False
                )
                if df_test is not None:
                    reduced_valid_sets_j = df_test[:]

    train_sets = {"num_data": num_obs, "labels": labels, "labels_j": labels_j}
    if df_test is not None:
        valid_sets = {
            "num_data": num_obs_test,
            "valid_labels": valid_labels,
        }

    # the metadata is pickled before the Dataset objects are attached, since
    # those are stored separately through save_binary
    if save_dataset:
        with open(f"{save_dataset}_train_sets.pkl", "wb") as f:
            pickle.dump(train_sets, f)
        if df_test is not None:
            with open(f"{save_dataset}_valid_sets.pkl", "wb") as f:
                pickle.dump(valid_sets, f)

    train_sets["train_sets"] = train_set_J
    if df_test is not None:
        # transpose from [entry][test set] to [test set][entry]
        reduced_valid_sets_J = np.array(reduced_valid_sets_J).T.tolist()
        valid_sets["valid_sets"] = reduced_valid_sets_J

    return train_sets, valid_sets