import numpy as np
import pandas as pd
import pickle
import random
import sys
import gc
from lightgbm import Dataset
from collections import Counter, defaultdict
try:
from sklearn.model_selection import train_test_split, GroupShuffleSplit, GroupKFold
sklearn_installed = True
except ImportError:
sklearn_installed = False
sys.path.append("../")
[docs]
def load_preprocess_LPMC(
    path="/media/nicolas-salvade/Windows/Users/DAF1/OneDrive - University College London/Documents/PhD - UCL/rumboost-dev/Data/",
):
    """
    Load and preprocess the LPMC dataset.

    Parameters
    ----------
    path : str, optional
        Prefix (including the trailing separator) prepended to
        ``LPMC_train.csv``, ``LPMC_test.csv`` and to the fold cache
        ``strat_group_k_fold_london.pickle``.

    Returns
    -------
    dataset_train : pandas Dataframe
        The training dataset ready to use.
    dataset_test : pandas Dataframe
        The test dataset ready to use.
    folds : zip(list, list)
        5 folds of indices grouped by household for CV. This is a one-shot
        iterator: materialise it with ``list(folds)`` to reuse it.

    Notes
    -----
    If the fold cache exists it is loaded as is; otherwise the folds are
    computed with ``stratified_group_k_fold`` and written to the cache.
    """
    # source: https://github.com/JoseAngelMartinB/prediction-behavioural-analysis-ml-travel-mode-choice
    data_train = pd.read_csv(path + "LPMC_train.csv")
    data_test = pd.read_csv(path + "LPMC_test.csv")

    # distance in km
    data_train["distance"] = data_train["distance"] / 1000
    data_test["distance"] = data_test["distance"] / 1000

    # rename label
    label_name = {"travel_mode": "choice"}
    dataset_train = data_train.rename(columns=label_name)
    dataset_test = data_test.rename(columns=label_name)

    # every column except the (renamed) label
    target = "choice"
    features = [f for f in dataset_test.columns if f != target]

    # household ids, used as groups so that a household never spans two folds
    hh_id = np.array(data_train["household_id"].values)

    # k folds sampled by households for cross validation
    fold_file = path + "strat_group_k_fold_london.pickle"
    try:
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx, test_idx = [], []
        for train_i, test_i in stratified_group_k_fold(
            data_train[features], data_train["travel_mode"], hh_id, k=5
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        # cache the folds so that later calls reuse the same split
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return dataset_train, dataset_test, folds
[docs]
def load_preprocess_SwissMetro(
    test_size: float = 0.3,
    random_state: int = 42,
    full_data=False,
    path="../Data/",
):
    """
    Load and preprocess the SwissMetro dataset. See Biogeme website for data.

    Parameters
    ----------
    test_size : float, optional (default = 0.3)
        The proportion of data used for test set.
    random_state : int, optional (default = 42)
        For reproducibility in the train-test split
    full_data : bool, optional (default = False)
        If the full dataset should be returned.
    path : str, optional
        The path to the data.

    Returns
    -------
    df_final : pandas Dataframe
        Only when ``full_data`` is True: the whole preprocessed dataset
        (nothing else is returned in that case).
    dataset_train : pandas Dataframe
        The training dataset ready to use.
    dataset_test : pandas Dataframe
        The test dataset ready to use.
    folds : zip(list, list)
        5 folds of indices grouped by ``ID`` for CV (one-shot iterator).

    Raises
    ------
    ImportError
        If scikit-learn is not installed.

    Notes
    -----
    The folds are cached in ``path + "strat_group_k_fold_swissmetro.pickle"``.
    An existing cache is reused without checking that it was produced with the
    same ``test_size`` / ``random_state``.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")

    df = pd.read_csv(path + "swissmetro.dat", sep="\t")

    label_name = {"CHOICE": "choice"}

    # keep rows with CHOICE != 0 and CAR_AV == 1; copy so that the column
    # assignments below write to an independent frame, not a slice of the raw data
    keep = (df["CHOICE"] != 0) & (df["CAR_AV"] == 1)
    df = df[keep].copy()

    # apply cost to people without GA
    no_ga = df["GA"] == 0
    df["TRAIN_COST"] = df["TRAIN_CO"] * no_ga
    df["SM_COST"] = df["SM_CO"] * no_ga

    # rescale choice from 0 to 2
    df["CHOICE"] = df["CHOICE"] - 1

    # luggage dummies
    df["SEV_LUGGAGES"] = (df["LUGGAGE"] == 3).astype(int)

    # origin / destination dummies (same code lists for both)
    rom_codes = [10, 22, 23, 24, 25, 26]
    tic_codes = [21]
    df["ORIG_ROM"] = df["ORIGIN"].isin(rom_codes).astype(int)
    df["ORIG_TIC"] = df["ORIGIN"].isin(tic_codes).astype(int)
    df["DEST_ROM"] = df["DEST"].isin(rom_codes).astype(int)
    df["DEST_TIC"] = df["DEST"].isin(tic_codes).astype(int)

    # purpose dummies PURPOSE_1 ... PURPOSE_8
    for purpose in range(1, 9):
        df[f"PURPOSE_{purpose}"] = (df["PURPOSE"] == purpose).astype(int)

    # age dummies AGE_1 ... AGE_4
    for age in range(1, 5):
        df[f"AGE_{age}"] = (df["AGE"] == age).astype(int)

    # final dataset; not selected: 'SM_SEATS', 'SEV_LUGGAGES', 'ORIG_ROM',
    # 'ORIG_TIC', 'DEST_ROM', 'DEST_TIC', 'AGE_3', 'AGE_4'
    cost_time_features = [
        "TRAIN_TT",
        "TRAIN_COST",
        "TRAIN_HE",
        "SM_TT",
        "SM_COST",
        "SM_HE",
        "CAR_TT",
        "CAR_CO",
    ]
    df_final = df[
        ["ID"]
        + cost_time_features
        + ["MALE", "FIRST"]
        + [f"PURPOSE_{purpose}" for purpose in range(1, 9)]
        + ["AGE_1", "AGE_2", "CHOICE"]
    ]
    df_final = df_final.rename(columns=label_name)

    if full_data:
        return df_final

    # split by ID so that all rows sharing an ID end up on the same side
    gsp = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    split_train, split_test = next(gsp.split(df_final, groups=df_final["ID"]))
    df_train, df_test = df_final.iloc[split_train], df_final.iloc[split_test]

    hh_id = df_train["ID"]

    # k folds sampled by ID for cross validation
    fold_file = path + "strat_group_k_fold_swissmetro.pickle"
    try:
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx, test_idx = [], []
        for train_i, test_i in stratified_group_k_fold(
            df_train[cost_time_features],
            df_train["choice"],
            hh_id,
            k=5,
            seed=random_state,
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        # cache the folds so that later calls reuse the same split
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return df_train, df_test, folds
[docs]
def load_preprocess_Optima(path: str = "../Data/"):
    """
    Load and preprocess the Optima dataset. See Biogeme website for data.

    Parameters
    ----------
    path : str, optional (default = "../Data/")
        Prefix (including the trailing separator) prepended to
        ``optima_ext_train.csv``, ``optima_ext_test.csv`` and to the fold
        cache ``strat_group_k_fold_optima.pickle``.

    Returns
    -------
    dataset_train : pandas Dataframe
        The training dataset ready to use.
    dataset_test : pandas Dataframe
        The test dataset ready to use.
    folds : zip(list, list)
        5 folds of indices grouped by ``ID`` for CV (one-shot iterator).

    Notes
    -----
    Nothing in this function uses scikit-learn, so it no longer refuses to run
    when scikit-learn is missing.
    """
    # source: https://github.com/JoseAngelMartinB/prediction-behavioural-analysis-ml-travel-mode-choice
    data_train = pd.read_csv(path + "optima_ext_train.csv")
    data_test = pd.read_csv(path + "optima_ext_test.csv")

    # ids, read before the ID column is dropped; used as CV groups
    hh_id = np.array(data_train["ID"].values)

    # rename label and drop IDs
    label_name = {"Choice": "choice"}
    dataset_train = data_train.rename(columns=label_name).drop("ID", axis=1)
    dataset_test = data_test.rename(columns=label_name).drop("ID", axis=1)

    # every column except the label
    target = "choice"
    features = [f for f in dataset_train.columns if f != target]

    # k folds sampled by ID for cross validation
    fold_file = path + "strat_group_k_fold_optima.pickle"
    try:
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx, test_idx = [], []
        for train_i, test_i in stratified_group_k_fold(
            dataset_train[features], dataset_train[target], hh_id, k=5
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        # cache the folds so that later calls reuse the same split
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return dataset_train, dataset_test, folds
[docs]
def load_preprocess_Netherlands(
    test_size: float = 0.3, random_state: int = 42, path: str = "../Data/"
):
    """
    Load and preprocess the Netherlands dataset. See Biogeme website for data.

    Parameters
    ----------
    test_size : float, optional (default = 0.3)
        The proportion of data used for test set.
    random_state : int, optional (default = 42)
        For reproducibility in the train-test split.
    path : str, optional (default = "../Data/")
        Prefix (including the trailing separator) prepended to
        ``netherlands.dat`` and to the fold cache
        ``strat_group_k_fold_netherlands.pickle``.

    Returns
    -------
    df_train : pandas Dataframe
        The training dataset ready to use.
    df_test : pandas Dataframe
        The test dataset ready to use.
    folds : zip(list, list)
        5 folds of indices grouped by ``id`` for CV (one-shot iterator).

    Raises
    ------
    ImportError
        If scikit-learn is not installed.

    Notes
    -----
    The train/test split is a plain row-wise ``train_test_split``; only the CV
    folds are grouped by ``id``. An existing fold cache is reused without
    checking ``test_size`` / ``random_state``.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")

    raw = pd.read_table(path + "netherlands.dat")

    # keep rows with rp == 1; copy so the new columns are written to an
    # independent frame and not to a slice of ``raw``
    data_rp = raw[raw["rp"] == 1].copy()

    # total time = in-vehicle time + out-of-vehicle time columns
    data_rp["rail_time"] = data_rp["rail_ivtt"] + data_rp["rp_rail_ovt"]
    data_rp["car_time"] = data_rp["car_ivtt"] + data_rp["rp_car_ovt"]

    # fixed factor from the original code (presumably a currency conversion
    # to euros -- TODO confirm)
    data_rp["car_cost_euro"] = data_rp["car_cost"] * 0.44378022
    data_rp["rail_cost_euro"] = data_rp["rail_cost"] * 0.44378022

    data_rp = data_rp.drop(
        [
            "rp",
            "sp",
            "rail_comfort",
            "rail_ivtt",
            "rail_cost",
            "rail_acc_time",
            "rail_egr_time",
            "rail_transfers",
            "car_ivtt",
            "car_cost",
            "car_walk_time",
            "rp_choice",
            "rp_rail_ovt",
            "rp_car_ovt",
        ],
        axis=1,
    )

    df_train, df_test = train_test_split(
        data_rp, test_size=test_size, random_state=random_state
    )

    # every column except the label
    target = "choice"
    features = [f for f in df_train.columns if f != target]

    # ids used as CV groups
    hh_id = np.array(df_train["id"].values)

    # k folds sampled by id for cross validation
    fold_file = path + "strat_group_k_fold_netherlands.pickle"
    try:
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx, test_idx = [], []
        for train_i, test_i in stratified_group_k_fold(
            df_train[features], df_train[target], hh_id, k=5
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        # cache the folds so that later calls reuse the same split
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return df_train, df_test, folds
[docs]
def load_preprocess_Airplane(test_size: float = 0.3, random_state: int = 42):
    """
    Load and preprocess the Airplane dataset. See Biogeme website for data.

    Parameters
    ----------
    test_size : float, optional (default = 0.3)
        The proportion of data used for test set.
    random_state : int, optional (default = 42)
        For reproducibility in the train-test split.

    Returns
    -------
    df_train : pandas Dataframe
        The training dataset ready to use.
    df_test : pandas Dataframe
        The test dataset ready to use.
    folds : zip(list, list)
        5 stratified folds of indices for CV (one-shot iterator). Each row is
        its own group because the row index is used as group label.

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")

    # NOTE(review): the data is read from "Data/" but the fold cache lives in
    # "../Data/"; kept as in the original -- confirm which one is intended.
    data = pd.read_table("Data/airline.dat")

    # label: 0 unless BestAlternative_2 == 1 (-> 1) or BestAlternative_3 == 1 (-> 2)
    data["choice"] = (data["BestAlternative_2"] == 1) + 2 * (
        data["BestAlternative_3"] == 1
    )
    data = data.drop(
        ["BestAlternative_1", "BestAlternative_2", "BestAlternative_3"], axis=1
    )

    # fares divided by 100
    for alt in (1, 2, 3):
        data[f"Fare_{alt}_scaled"] = data[f"Fare_{alt}"] / 100

    # trip time of alternatives 2 and 3 relative to alternative 1
    data["TTDIFF_TRANSFER"] = data["TripTimeHours_2"] - data["TripTimeHours_1"]
    data["TTDIFF_TRANSFER_TWOAIRLINES"] = (
        data["TripTimeHours_3"] - data["TripTimeHours_1"]
    )

    df = data[
        [
            "DepartureTimeHours_1",
            "DepartureTimeHours_2",
            "DepartureTimeHours_3",
            "ArrivalTimeHours_1",
            "ArrivalTimeHours_2",
            "ArrivalTimeHours_3",
            "TTDIFF_TRANSFER",
            "TTDIFF_TRANSFER_TWOAIRLINES",
            "Legroom_1",
            "Legroom_2",
            "Legroom_3",
            "Fare_1_scaled",
            "Fare_2_scaled",
            "Fare_3_scaled",
            "choice",
        ]
    ]

    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # every column except the label
    target = "choice"
    features = [f for f in df_train.columns if f != target]

    # the row index is used as group label
    hh_id = df_train.index.tolist()

    # k folds for cross validation
    fold_file = "../Data/strat_group_k_fold_airplane.pickle"
    try:
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx, test_idx = [], []
        for train_i, test_i in stratified_group_k_fold(
            df_train[features], df_train[target], hh_id, k=5
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        # cache the folds so that later calls reuse the same split
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return df_train, df_test, folds
[docs]
def load_preprocess_Telephone(test_size: float = 0.3, random_state: int = 3):
    """
    Load and preprocess the Telephone dataset. See Biogeme website for data.

    Parameters
    ----------
    test_size : float, optional (default = 0.3)
        The proportion of data used for test set.
    random_state : int, optional (default = 3)
        For reproducibility in the train-test split.

    Returns
    -------
    df_train : pandas Dataframe
        The training dataset ready to use.
    df_test : pandas Dataframe
        The test dataset ready to use.
    folds : zip(list, list)
        5 stratified folds of indices for CV (one-shot iterator). Each row is
        its own group because the row index is used as group label.

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")

    # NOTE(review): the data is read from "Data/" but the fold cache lives in
    # "../Data/"; kept as in the original -- confirm which one is intended.
    data = pd.read_table("Data/telephone.dat")

    # shift the label down by one
    data["choice"] = data["choice"] - 1

    # costs divided by 10, stored next to the unscaled columns
    for alt in range(1, 6):
        data[f"cost{alt}_scaled"] = data[f"cost{alt}"] / 10

    df_train, df_test = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    # every column except the label
    target = "choice"
    features = [f for f in df_train.columns if f != target]

    # the row index is used as group label
    hh_id = df_train.index.tolist()

    # k folds for cross validation
    fold_file = "../Data/strat_group_k_fold_telephone.pickle"
    try:
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx, test_idx = [], []
        for train_i, test_i in stratified_group_k_fold(
            df_train[features], df_train[target], hh_id, k=5
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        # cache the folds so that later calls reuse the same split
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return df_train, df_test, folds
[docs]
def load_preprocess_Parking(test_size: float = 0.3, random_state: int = 42):
    """
    Load and preprocess the Parking dataset. See Biogeme website for data.

    Parameters
    ----------
    test_size : float, optional (default = 0.3)
        The proportion of data used for test set.
    random_state : int, optional (default = 42)
        For reproducibility in the train-test split.

    Returns
    -------
    df_train : pandas Dataframe
        The training dataset ready to use.
    df_test : pandas Dataframe
        The test dataset ready to use.
    folds : zip(list, list)
        5 stratified folds of indices for CV (one-shot iterator). Each row is
        its own group because the row index is used as group label.

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")

    # NOTE(review): the data is read from "Data/" but the fold cache lives in
    # "../Data/"; kept as in the original -- confirm which one is intended.
    data = pd.read_table("Data/parking.dat")

    # shift the label down by one, drop identifier columns, rename the label
    data["CHOICE"] = data["CHOICE"] - 1
    data = data.drop(["ID", "OBSID", "SCENARIO"], axis=1)
    data = data.rename(columns={"CHOICE": "choice"})

    df_train, df_test = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    # every column except the label
    target = "choice"
    features = [f for f in df_train.columns if f != target]

    # the row index is used as group label
    hh_id = df_train.index.tolist()

    # k folds for cross validation
    fold_file = "../Data/strat_group_k_fold_parking.pickle"
    try:
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx, test_idx = [], []
        for train_i, test_i in stratified_group_k_fold(
            df_train[features], df_train[target], hh_id, k=5
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        # cache the folds so that later calls reuse the same split
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return df_train, df_test, folds
[docs]
def load_preprocess_Vaccines(path: str = "../Data/"):
    """
    Load and preprocess the Vaccines dataset.

    Parameters
    ----------
    path : str, optional (default = "../Data/")
        Prefix (including the trailing separator) prepended to
        ``vaccinechoiceMar12.csv`` and to the fold cache
        ``strat_group_k_fold_vaccine.pickle``.

    Returns
    -------
    df_train : pandas Dataframe
        Rows with ``Wave4 != 1``.
    df_test : pandas Dataframe
        Rows with ``Wave4 == 1``.
    folds : zip(list, list)
        5 folds of indices grouped by ``IDnum`` for CV (one-shot iterator).

    Notes
    -----
    ``IDnum`` and ``Wave4`` are left in the returned frames.
    """
    data = pd.read_csv(path + "vaccinechoiceMar12.csv")

    # shift the label down by one
    data["choice"] = data["vaccinechoice"] - 1

    # strip the dot from the "<attribute>.1" / "<attribute>.3" column names
    attributes = [
        "cost",
        "effectiveness",
        "protection",
        "incubation",
        "severe",
        "mild",
        "doses",
        "booster",
        "USA",
        "UK",
        "Germany",
        "China",
        "Russia",
        "media",
        "CDC",
        "WHO",
        "months",
    ]
    new_names = {
        f"{attribute}.{suffix}": f"{attribute}{suffix}"
        for suffix in (1, 3)
        for attribute in attributes
    }
    data = data.rename(columns=new_names)

    cleaned = data.drop(["ID", "ZIP", "state"], axis=1)

    # Wave4 rows form the test set, all others the training set
    df_train = cleaned[cleaned["Wave4"] != 1]
    df_test = cleaned[cleaned["Wave4"] == 1]

    # every column except the label
    target = "choice"
    features = [f for f in df_train.columns if f != target]

    # ids used as CV groups
    hh_id = df_train["IDnum"]

    # k folds sampled by IDnum for cross validation
    fold_file = path + "strat_group_k_fold_vaccine.pickle"
    try:
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx, test_idx = [], []
        for train_i, test_i in stratified_group_k_fold(
            df_train[features], df_train[target], hh_id, k=5
        ):
            train_idx.append(train_i)
            test_idx.append(test_i)
        # cache the folds so that later calls reuse the same split
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return df_train, df_test, folds
[docs]
def load_preprocess_MTMC(
    test_size: float = 0.2,
    random_state: int = 1,
    path="/media/nicolas-salvade/Windows/Users/DAF1/OneDrive - University College London/Documents/PhD - UCL/rumboost/Data/",
):
    """
    Load and preprocess the MTMC dataset.

    Parameters
    ----------
    test_size : float, optional (default = 0.2)
        The proportion of data used for test set.
    random_state : int, optional (default = 1)
        For reproducibility in the train-test split.
    path : str, optional
        Prefix (including the trailing separator) prepended to
        ``data_laus_trips_prep_attractions_allalt.csv``, ``z_idx.csv`` and to
        the fold cache ``strat_group_k_fold_mtmc.pickle``.

    Returns
    -------
    df_train : pandas Dataframe
        The training dataset.
    df_test : pandas Dataframe
        The test dataset.
    folds : zip(list, list)
        Folds of indices grouped by ``HHNR`` for CV (one-shot iterator),
        produced by ``GroupKFold()`` with its default number of splits when no
        cache exists.
    z_idx : list
        Content of ``z_idx.csv`` as loaded by ``numpy.loadtxt``.

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")

    # load data
    data = pd.read_csv(path + "data_laus_trips_prep_attractions_allalt.csv")

    # load destination zones
    z_idx = list(np.loadtxt(path + "z_idx.csv"))

    # split by household: all rows of one HHNR end up on the same side
    gsp = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    split_train, split_test = next(gsp.split(data, groups=data["HHNR"]))
    df_train, df_test = data.iloc[split_train], data.iloc[split_test]

    # every column except the label
    target = "choice"
    features = [f for f in df_train.columns if f != target]

    hh_id = df_train["HHNR"]

    # k folds sampled by households for cross validation
    fold_file = path + "strat_group_k_fold_mtmc.pickle"
    try:
        with open(fold_file, "rb") as f:
            train_idx, test_idx = pickle.load(f)
    except FileNotFoundError:
        train_idx, test_idx = [], []
        gkf = GroupKFold()
        for train_i, test_i in gkf.split(df_train[features], df_train[target], hh_id):
            train_idx.append(train_i)
            test_idx.append(test_i)
        # cache the folds so that later calls reuse the same split
        with open(fold_file, "wb") as f:
            pickle.dump([train_idx, test_idx], f)

    folds = zip(train_idx, test_idx)

    return df_train, df_test, folds, z_idx
[docs]
def load_preprocess_MTMC_all(
    test_size: float = 0.2,
    random_state: int = 1,
    path="/media/nicolas-salvade/Windows/Users/DAF1/OneDrive - University College London/Documents/PhD - UCL/rumboost/Data/",
):
    """
    Load and preprocess the MTMC dataset for all swiss zones.

    Parameters
    ----------
    test_size : float, optional (default = 0.2)
        The proportion of data used for test set (only used when the split
        has to be rebuilt).
    random_state : int, optional (default = 1)
        For reproducibility in the train-test split (only used when the split
        has to be rebuilt).
    path : str, optional
        Prefix (including the trailing separator) prepended to
        ``z_idx_all_wo_alps.csv`` and to the cached split files
        ``train_set_switzerland.pkl`` / ``test_set_switzerland.pkl``.

    Returns
    -------
    df_train : pandas Dataframe
        The training dataset.
    df_test : pandas Dataframe
        The test dataset.
    folds : None
        No cross-validation folds are built for this dataset.
    z_idx : list
        Content of ``z_idx_all_wo_alps.csv`` as loaded by ``numpy.loadtxt``.

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    FileNotFoundError
        If the zone file is missing, or if the cached split is missing and the
        raw data file cannot be found either.

    Notes
    -----
    The cached split is reused without checking ``test_size`` /
    ``random_state``.
    """
    if not sklearn_installed:
        raise ImportError("scikit-learn is required for this function.")

    # Absolute location of the raw data. The original code concatenated this
    # onto ``path``, which cannot resolve with the default ``path``; it is now
    # used as is.
    # NOTE(review): machine-specific location -- confirm where the raw file is
    # expected to live.
    raw_data_file = "/media/nicolas-salvade/Windows/Users/DAF1/OneDrive - University College London/Documents/PhD - UCL/choice_set_location_travelmode/Data/input/data_switzerland_trips_preprocessed.pkl"

    # load destination zones (needed whether or not the split is cached)
    z_idx = list(np.loadtxt(path + "z_idx_all_wo_alps.csv"))

    train_file = path + "train_set_switzerland.pkl"
    test_file = path + "test_set_switzerland.pkl"

    try:
        with open(train_file, "rb") as f:
            df_train = pickle.load(f)
        with open(test_file, "rb") as f:
            df_test = pickle.load(f)
    except FileNotFoundError:
        # no cached split: rebuild it from the raw data
        with open(raw_data_file, "rb") as f:
            data = pickle.load(f)

        # drop trips whose origin or destination index is in 7965..7977
        zone_to_drop = list(range(7965, 7978))
        mask = ~data["d_idx"].isin(zone_to_drop) & ~data["o_idx"].isin(zone_to_drop)
        data = data[mask]

        # split by household: all rows of one HHNR end up on the same side
        gsp = GroupShuffleSplit(
            n_splits=1, test_size=test_size, random_state=random_state
        )
        train_idx, test_idx = next(gsp.split(data, groups=data["HHNR"]))
        df_train, df_test = data.iloc[train_idx], data.iloc[test_idx]

        # cache the split for later calls
        with open(train_file, "wb") as f:
            pickle.dump(df_train, f)
        with open(test_file, "wb") as f:
            pickle.dump(df_test, f)

    folds = None

    return df_train, df_test, folds, z_idx
# Sample a dataset grouped by `groups` and stratified by `y`
# Source: https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation
[docs]
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """
    Stratified Group K-Fold cross-validator

    Provides train/test indices to split data in train/test sets. Groups are
    assigned greedily, one at a time, to the fold that keeps the per-label
    share of observations most even across folds.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The input samples. Not used by the algorithm; kept for API symmetry.
    y : array-like of shape (n_samples,)
        The target values, non-negative integers usable as array indices.
    groups : array-like of shape (n_samples,)
        Group labels for the samples used while splitting the dataset into train/test set.
        Must be iterable several times (not a generator).
    k : int
        Number of folds. Must be at least 2.
    seed : int, optional
        Random seed for shuffling the groups (breaks ties between groups).

    Yields
    ------
    train : list of int
        The training set positional indices for that split.
    test : list of int
        The testing set positional indices for that split.

    Notes
    -----
    Labels in ``0..max(y)`` that never occur are ignored when scoring a fold.
    Previously they caused a division by zero whose NaN result sent every
    group to the first fold.
    """
    labels_num = np.max(y) + 1

    # per-group label histogram and overall label counts
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    # only labels that actually occur can be balanced across folds
    observed_labels = [label for label in range(labels_num) if y_distr[label] > 0]

    y_counts_per_fold = [np.zeros(labels_num) for _ in range(k)]
    groups_per_fold = [set() for _ in range(k)]

    def eval_y_counts_per_fold(y_counts, fold):
        """Score ``fold`` as a home for ``y_counts``: mean over labels of the
        std across folds of each label's share. Lower is better."""
        # tentatively add the group, measure the imbalance, then undo
        y_counts_per_fold[fold] += y_counts
        std_per_label = [
            np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            for label in observed_labels
        ]
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    # shuffle first: the stable sort below then orders ties randomly
    random.Random(seed).shuffle(groups_and_y_counts)

    # place the most label-skewed groups first
    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for fold in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, fold)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = fold
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for fold in range(k):
        test_groups = groups_per_fold[fold]
        train_groups = all_groups - test_groups

        train_indices = [idx for idx, g in enumerate(groups) if g in train_groups]
        test_indices = [idx for idx, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices
[docs]
def prepare_dataset(
    rum_structure,
    df_train,
    num_classes,
    df_test=None,
    target="choice",
    free_raw_data=False,
    save_dataset=None,
    load_dataset=None,
):
    """
    Prepare and save if required the datasets for RUMBoost.

    Parameters
    ----------
    rum_structure : list of dict
        The structure of the RUM model. Each entry is read through its
        ``"variables"``, ``"shared"``, ``"utility"`` and ``"boosting_params"``
        keys.
    df_train : pandas DataFrame
        The training dataset.
    num_classes : int
        The number of classes.
    df_test : list of pandas DataFrame, optional
        The list of test datasets.
    target : str, optional
        The target variable.
    free_raw_data : bool, optional
        If the raw data should be freed (applies to the training datasets
        built from ``df_train``).
    save_dataset : str, optional
        The path prefix used to save the datasets.
    load_dataset : str, optional
        The path prefix used to load the datasets. When given, nothing is
        built from ``df_train``.

    Returns
    -------
    train_sets : dict
        ``"num_data"``, ``"labels"``, ``"labels_j"`` and ``"train_sets"`` (one
        lightgbm Dataset per entry of ``rum_structure`` that has
        ``"variables"``). When loading, the dict comes from the saved pickle
        plus ``"train_sets"``.
    valid_sets : dict
        Empty when ``df_test`` is None; otherwise ``"num_data"``,
        ``"valid_labels"`` and ``"valid_sets"`` (indexed by test set first,
        then by dataset).

    Raises
    ------
    FileNotFoundError
        If anything fails while loading previously saved datasets. The
        original error is attached as ``__cause__``.
    """
    valid_sets = {}
    num_datasets = len(rum_structure)

    if load_dataset:
        # metadata dictionaries saved alongside the binary datasets
        try:
            with open(f"{load_dataset}_train_sets.pkl", "rb") as f:
                train_sets = pickle.load(f)
            if df_test is not None:
                with open(f"{load_dataset}_valid_sets.pkl", "rb") as f:
                    valid_sets = pickle.load(f)
        except Exception as err:
            raise FileNotFoundError(
                "Error loading datasets, try running again this function without the load_dataset parameter."
            ) from err

        train_set_J = []
        reduced_valid_sets_J = []
        try:
            for j, _ in enumerate(rum_structure):
                print(
                    "-" * 30
                    + "\n"
                    + f"[{j+1}/{num_datasets}] \t Loading dataset {j+1}..."
                )
                train_set_J.append(
                    Dataset(
                        data=f"{load_dataset}_train_set_{j}.bin", free_raw_data=False
                    )
                )
                if df_test is not None:
                    reduced_valid_sets_j = []
                    # df_test is only used here for the number of test sets
                    for i, _ in enumerate(df_test):
                        reduced_valid_sets_j.append(
                            Dataset(
                                data=f"{load_dataset}_valid_set_{j}_{i}.bin",
                                free_raw_data=False,
                                reference=train_set_J[j],
                            )
                        )
                    reduced_valid_sets_J.append(reduced_valid_sets_j)
                print("\t done! \n" + "-" * 30 + "\n")
        except Exception as err:
            raise FileNotFoundError(
                "Error loading dataset, try running again this function without the load_dataset parameter."
            ) from err

        train_sets["train_sets"] = train_set_J
        if df_test is not None:
            # transpose [dataset][test set] -> [test set][dataset]
            valid_sets["valid_sets"] = np.array(reduced_valid_sets_J).T.tolist()

        return train_sets, valid_sets

    # integer labels and their one-hot encoding (n_obs x num_classes)
    labels = df_train[target].to_numpy().astype(np.int32)
    labels_j = (labels[:, None] == np.array(range(num_classes))[None, :]).astype(
        np.int8
    )
    num_obs = df_train.shape[0]

    if df_test is not None:
        valid_labels = [df[target].to_numpy().astype(np.int32) for df in df_test]
        num_obs_test = [df.shape[0] for df in df_test]
        val_labels_j = [
            (val_labs[:, None] == np.array(range(num_classes))[None, :]).astype(np.int8)
            for val_labs in valid_labels
        ]

    train_set_J = []
    reduced_valid_sets_J = []

    for j, struct in enumerate(rum_structure):
        print("-" * 30 + "\n" + f"[{j+1}/{num_datasets}] \t Loading dataset {j+1}...")
        if not struct:
            continue

        if "variables" in struct:
            # only relevant features for the jth booster
            train_set_j_data = df_train[struct["variables"]]

            # the `== True` comparison is kept from the original on purpose
            is_shared = struct["shared"] == True  # noqa: E712
            # one-hot columns used as labels when the ensemble is shared
            shared_columns = struct["utility"][: len(struct["variables"])]

            if is_shared:
                # stack the selected one-hot columns into one long label vector
                new_label = labels_j[:, shared_columns].reshape(-1, order="F")
                feature_names = "auto"
            else:
                new_label = labels_j[:, 0].reshape(-1, order="F")
                feature_names = struct["variables"]

            # create and build dataset; the data is reshaped so that its number
            # of rows matches the label vector
            train_set_j = Dataset(
                train_set_j_data.values.reshape((len(new_label), -1), order="A"),
                label=new_label,
                free_raw_data=free_raw_data,
            )
            categorical_feature = struct["boosting_params"].get(
                "categorical_feature", "auto"
            )
            train_set_j._update_params(struct["boosting_params"])._set_predictor(
                None
            ).set_feature_name(feature_names).set_categorical_feature(
                categorical_feature
            )

            # Bound up front so that the `del` below also works when there is
            # no (or an empty) df_test; it used to raise UnboundLocalError.
            valid_set_j_data = None
            valid_set_j = None

            if df_test is not None:
                reduced_valid_sets_j = []
                for i, valid_set in enumerate(df_test):
                    # only relevant features for the jth booster
                    valid_set_j_data = valid_set[struct["variables"]]
                    if is_shared:
                        label_valid = val_labels_j[i][:, shared_columns].reshape(
                            -1, order="F"
                        )
                    else:
                        label_valid = val_labels_j[i][:, 0].reshape(-1, order="F")
                    # create and build validation set, bound to the training set
                    valid_set_j = Dataset(
                        valid_set_j_data.values.reshape(
                            (len(label_valid), -1),
                            order="A",
                        ),
                        label=label_valid,
                        free_raw_data=False,
                        reference=train_set_j,
                    )
                    valid_set_j._update_params(struct["boosting_params"])
                    reduced_valid_sets_j.append(valid_set_j)

                    if save_dataset:
                        valid_set_j.save_binary(
                            f"{save_dataset}_valid_set_{j}_{i}.bin"
                        )

            train_set_J.append(train_set_j)
            if save_dataset:
                train_set_j.save_binary(f"{save_dataset}_train_set_{j}.bin")

            if df_test is not None:
                reduced_valid_sets_J.append(reduced_valid_sets_j)

            # drop the local references before collecting garbage
            del (
                train_set_j_data,
                valid_set_j_data,
                train_set_j,
                valid_set_j,
            )
            gc.collect()
            print("\t done! \n" + "-" * 30 + "\n")
        else:
            # if no alternative specific datasets
            # NOTE(review): the source's indentation was lost; this branch is
            # assumed to pair with `if "variables" in struct`. Under that
            # reading the objects built here are never appended to the output
            # lists -- confirm the intended behaviour.
            new_label = np.where(labels == j, 1, 0)
            train_set_j = Dataset(
                df_train.values, label=new_label, free_raw_data=False
            )
            if df_test is not None:
                reduced_valid_sets_j = df_test[:]

    train_sets = {"num_data": num_obs, "labels": labels, "labels_j": labels_j}
    if df_test is not None:
        valid_sets = {
            "num_data": num_obs_test,
            "valid_labels": valid_labels,
        }

    # only the metadata is pickled; the Datasets themselves are saved as .bin
    if save_dataset:
        with open(f"{save_dataset}_train_sets.pkl", "wb") as f:
            pickle.dump(train_sets, f)
        if df_test is not None:
            with open(f"{save_dataset}_valid_sets.pkl", "wb") as f:
                pickle.dump(valid_sets, f)

    train_sets["train_sets"] = train_set_J
    if df_test is not None:
        # transpose [dataset][test set] -> [test set][dataset]
        reduced_valid_sets_J = np.array(reduced_valid_sets_J).T.tolist()
        valid_sets["valid_sets"] = reduced_valid_sets_J

    return train_sets, valid_sets