notebook

import%20marimo%0A%0A__generated_with%20%3D%20%220.17.8%22%0Aapp%20%3D%20marimo.App(width%3D%22medium%22)%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20%23%20External%20dependencies%0A%20%20%20%20import%20marimo%20as%20mo%0A%20%20%20%20import%20pandas%20as%20pd%0A%20%20%20%20import%20seaborn%20as%20sns%0A%20%20%20%20import%20matplotlib.pylab%20as%20plt%0A%0A%20%20%20%20%23%20Internal%20dependencies%0A%20%20%20%20import%20preprocess%0A%20%20%20%20import%20process%0A%20%20%20%20import%20model_selection%0A%20%20%20%20import%20model%0A%20%20%20%20return%20mo%2C%20model%2C%20model_selection%2C%20plt%2C%20preprocess%2C%20process%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%20Labour%20Force%20Survey%20(LFS)%0A%0A%20%20%20%20%23%23%20Introduction%0A%0A%20%20%20%20In%20this%20notebook%2C%20we%20explore%20the%20factors%20that%20most%20strongly%20predict%20employment%20status%20using%20tree-based%20classifiers%20(random%20forest%20and%20XGBoost).%20Our%20objectives%20are%3A%0A%0A%20%20%20%201.%20To%20identify%20which%20sociodemographic%20and%20economic%20features%20contribute%20most%20to%20the%20probability%20of%20being%20employed.%0A%20%20%20%202.%20To%20interpret%20these%20effects%20in%20a%20way%20that%20is%20transparent%20and%20actionable.%0A%0A%20%20%20%20We%20will%20train%2C%20evaluate%2C%20and%20interpret%20our%20model%2C%20leveraging%20feature%20importance%20methods%20and%20interpretability%20techniques.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Dataset%0A%0A%20%20%20%20This%20notebook%20presents%20an%20exploratory%20data%20analysis%20of%20the%20microdata%20from%20the%20%5Bthird%20quarter%20of%202025%20of%20the%20Labour%20Force%20Survey%5D(https%3A%2F%2Fwww.ine.es%2Fdyngs%2FPrensa%2FEPA3T25.htm)%20(LFS)%20conducted%20by%20the%20%5BSpanish%20National%20Statistics%20Institute%5D(https%3A%2F%2Fwww.ine.es)%20(INE).%20The%20
main%20goal%20of%20this%20phase%20is%20to%20become%20familiar%20with%20the%20data%20structure%2C%20understand%20the%20key%20variables%2C%20and%20detect%20potential%20patterns%2C%20inconsistencies%2C%20or%20outliers%20that%20may%20influence%20the%20analysis.%0A%0A%20%20%20%20%23%23%23%20Exploratory%20Data%20Analysis%0A%0A%20%20%20%20Throughout%20the%20EDA%2C%20the%20following%20aspects%20will%20be%20addressed%3A%0A%0A%20%20%20%20*%20Review%20of%20the%20structure%20and%20coding%20of%20variables.%0A%20%20%20%20*%20Distribution%20of%20the%20population%20by%20demographic%20and%20labour%20characteristics.%0A%20%20%20%20*%20Identification%20of%20basic%20relationships%20between%20activity%20status%2C%20occupation%2C%20sector%2C%20and%20educational%20level.%0A%20%20%20%20*%20Assessment%20of%20data%20quality%3A%20missing%20values%2C%20duplicates%2C%20and%20internal%20consistency%20between%20variables.%0A%0A%20%20%20%20This%20analysis%20does%20not%20aim%20to%20draw%20final%20statistical%20conclusions%2C%20but%20rather%20to%20establish%20a%20solid%20foundation%20for%20future%20modelling%2C%20ensuring%20that%20the%20data%20are%20properly%20understood%20and%20prepared%20for%20use.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(preprocess)%3A%0A%20%20%20%20df%20%3D%20preprocess.read_dataset()%0A%20%20%20%20df.head()%0A%20%20%20%20return%20(df%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Data%20Inventory%0A%0A%20%20%20%20Although%20the%20full%20dataset%20contains%2091%20columns%2C%20we'll%20focus%20our%20analysis%20on%20a%20selection%20of%20them%2C%20which%20we%20will%20describe%20and%20document%20in%20this%20section.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Field%20%60prov%60%0A%0A%20%20%20%20The%20field%20%60prov%60%20refers%20to%20the%20province%20of%
20Spain%20of%20the%20interviewed%20person.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(df%2C%20preprocess)%3A%0A%20%20%20%20preprocess.map_prov(df).value_counts()%5B%3A3%5D%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Field%20%60edad1%60%0A%0A%20%20%20%20The%20field%20%60edad1%60%20classifies%20the%20interviewed%20person's%20age%20on%20different%20age%20groups.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(df%2C%20preprocess)%3A%0A%20%20%20%20preprocess.map_edad1(df).value_counts()%5B%3A3%5D%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Field%20%60sexo1%60%0A%0A%20%20%20%20The%20field%20%60sexo1%60%20identifies%20the%20interviewed%20person%20as%20either%20man%20or%20woman.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(df%2C%20preprocess)%3A%0A%20%20%20%20preprocess.map_sexo1(df).value_counts()%5B%3A3%5D%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Field%20%60eciv1%60%0A%0A%20%20%20%20The%20field%20%60eciv1%60%20corresponds%20with%20the%20marital%20status.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(df%2C%20preprocess)%3A%0A%20%20%20%20preprocess.map_eciv1(df).value_counts()%5B%3A3%5D%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Field%20%60nforma%60%0A%0A%20%20%20%20The%20field%20%60nforma%60%20contains%20the%20educational%20level.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(df%2C%20preprocess)%3A%0A%20%20%20%20preprocess.map_nforma(df).value_counts()%5B%3A3%5D%0A%20%20%20%20return
%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Target%20variable%20%60trarem%60%0A%0A%20%20%20%20Finally%2C%20the%20field%20%60trarem%60%20is%20our%20target%20variable%20and%20answers%20to%20the%20question%20whether%20the%20person%20did%20any%20paid%20job%20during%20the%20week%20before%20the%20interview.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(df%2C%20preprocess)%3A%0A%20%20%20%20preprocess.map_trarem(df).value_counts()%5B%3A3%5D%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Reduced%20Dataset%0A%0A%20%20%20%20Taking%20into%20account%20only%20the%20fields%20that%20we%20added%20to%20our%20inventory%2C%20documented%20above%2C%20we'll%20prepare%20now%20a%20%22reduced%22%20dataset%20which%20will%20also%20be%20categorized%20so%20that%20it's%20almost%20ready%20to%20be%20processed%20by%20a%20dictionary%20vectorizer.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(df%2C%20preprocess)%3A%0A%20%20%20%20df_reduced%20%3D%20preprocess.reduced_dataset(df)%0A%20%20%20%20df_reduced.head()%0A%20%20%20%20return%20(df_reduced%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Empty%20Fields%0A%0A%20%20%20%20Here%20we%20analyze%20null%20values%20and%20take%20decisions%20on%20how%20to%20handle%20them.%20To%20get%20started%2C%20we%20quickly%20check%20the%20number%20of%20records%20that%20contain%20null%20values%20for%20each%20column.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(df_reduced)%3A%0A%20%20%20%20df_reduced.isnull().sum()%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20From%20the%20previous%20results%2C%20it%20seems%20likely%20that%20the%20records%20m
issing%20%60eciv1%60%20are%20the%20same%20records%20missing%20%60nforma%60%20and%20%60trarem%60.%20Let's%20first%20check%20that.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(df_reduced)%3A%0A%20%20%20%20eciv1_equals_nforma%20%3D%20(%0A%20%20%20%20%20%20%20%20df_reduced%5Bdf_reduced.eciv1.isnull()%5D.index%0A%20%20%20%20%20%20%20%20%3D%3D%20df_reduced%5Bdf_reduced.nforma.isnull()%5D.index%0A%20%20%20%20).all()%0A%0A%20%20%20%20eciv1_equals_trarem%20%3D%20(%0A%20%20%20%20%20%20%20%20df_reduced%5Bdf_reduced.eciv1.isnull()%5D.index%0A%20%20%20%20%20%20%20%20%3D%3D%20df_reduced%5Bdf_reduced.trarem.isnull()%5D.index%0A%20%20%20%20).all()%0A%0A%20%20%20%20if%20eciv1_equals_nforma%20and%20eciv1_equals_trarem%3A%0A%20%20%20%20%20%20%20%20print(%22All%20the%20nulls%20from%20evic1%20are%20the%20same%20as%20nforma%20and%20trarem%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20As%20the%20records%20are%20the%20same%2C%20checking%20which%20ones%20are%20null%20in%20one%20of%20the%20columns%20would%20suffice%3A%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(df_reduced)%3A%0A%20%20%20%20df_reduced%5Bdf_reduced.eciv1.isnull()%5D.edad1.value_counts()%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20From%20the%20last%20results%20we%20can%20see%20that%20the%20only%20records%20without%20the%20variables%20%60eciv1%60%2C%20%60nforma%60%20and%20the%20target%20%60traren%60%20are%20the%20records%20that%20correspond%20to%20people%20who%20is%20less%20than%2016%20years%20old.%20We%20can%20then%20safely%20exclude%20those%20records%20from%20our%20study.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(df%2C%20preprocess)%3A%0A%20%20%20%20df_filtered%20%3D%20preprocess.filtered_dataset(df)%0A%20%20%20%20df_filtered
.head()%0A%20%20%20%20return%20(df_filtered%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Set%20up%20the%20Validation%20Framework%0A%0A%20%20%20%20We%20now%20need%20to%20split%20our%20dataset%20into%203%20splits%3A%20train%2C%20validation%20and%20test.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(df_filtered%2C%20process)%3A%0A%20%20%20%20df_train%2C%20df_full%2C%20df_val%2C%20df_test%20%3D%20process.split_dataset(df_filtered)%0A%0A%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%22len(df_train)%22%3A%20len(df_train)%2C%0A%20%20%20%20%20%20%20%20%22len(df_full)%22%3A%20len(df_full)%2C%0A%20%20%20%20%20%20%20%20%22len(df_val)%22%3A%20len(df_val)%2C%0A%20%20%20%20%20%20%20%20%22len(df_test)%22%3A%20len(df_test)%2C%0A%20%20%20%20%7D%0A%20%20%20%20return%20df_full%2C%20df_train%2C%20df_val%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Dictionary%20Vectorizer%0A%0A%20%20%20%20As%20all%20our%20features%20are%20categoric%20columns%2C%20we'll%20need%20to%20prepare%20the%20data%20using%20a%20dictionary%20vectorizer.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(df_train%2C%20process)%3A%0A%20%20%20%20train_features%2C%20train_target%20%3D%20process.separate_features_and_target(df_train)%0A%20%20%20%20dict_vectorizer%20%3D%20process.train_dict_vectorizer(train_features)%0A%20%20%20%20return%20dict_vectorizer%2C%20train_features%2C%20train_target%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Features%20and%20Target%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(df_full%2C%20df_val%2C%20dict_vectorizer%2C%20process%2C%20train_features%2C%20train_target)%3A%0A%20%20%20%20X_train%20%3D%20dict_vectorizer.transform(train_features.to_dict(orient%3D%22records%22))%0A%20%20%20%20y_train%20%3D
%20train_target%20%3D%3D%20%22Yes%22%0A%0A%20%20%20%20full_features%2C%20full_target%20%3D%20process.separate_features_and_target(df_full)%0A%20%20%20%20X_full%20%3D%20dict_vectorizer.transform(full_features.to_dict(orient%3D%22records%22))%0A%20%20%20%20y_full%20%3D%20full_target%20%3D%3D%20%22Yes%22%0A%0A%20%20%20%20val_features%2C%20val_target%20%3D%20process.separate_features_and_target(df_val)%0A%20%20%20%20X_val%20%3D%20dict_vectorizer.transform(val_features.to_dict(orient%3D%22records%22))%0A%20%20%20%20y_val%20%3D%20val_target%20%3D%3D%20%22Yes%22%0A%20%20%20%20return%20X_full%2C%20X_train%2C%20X_val%2C%20y_full%2C%20y_train%2C%20y_val%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Random%20Forest%0A%0A%20%20%20%20Our%20goal%20with%20this%20dataset%20is%20not%20to%20be%20able%20to%20predict%20if%20someone%20will%20have%20an%20occupation%20or%20not%20depending%20on%20certain%20parameters%2C%20but%20to%20understand%20what%20parameters%20correlate%20better%20with%20the%20fact%20that%20someone%20has%20an%20occupation%20instead.%20So%20we'll%20focus%20on%20interpretable%20(decision-tree%20based)%20models.%20Let's%20start%20by%20training%20a%20random%20forest.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(X_train%2C%20model_selection%2C%20y_train)%3A%0A%20%20%20%20from%20time%20import%20time%2C%20sleep%0A%0A%20%20%20%20random_forest_param_distributions%20%3D%20%7B%0A%20%20%20%20%20%20%20%20%22n_estimators%22%3A%20%5B50%2C%20100%2C%20200%5D%2C%0A%20%20%20%20%20%20%20%20%22max_depth%22%3A%20%5BNone%2C%201%2C%202%2C%203%2C%205%2C%2010%2C%2025%2C%2050%5D%2C%0A%20%20%20%20%20%20%20%20%22min_samples_leaf%22%3A%20%5B1%2C%2025%2C%2050%2C%20100%2C%20500%2C%201000%5D%2C%0A%20%20%20%20%7D%0A%0A%20%20%20%20start_time%20%3D%20time()%0A%20%20%20%20random_forest_search%20%3D%20model_selection.search_random_forest(X_train%2C%20y_train%2C%20random_forest_param_distributions)%0A
%20%20%20%20end_time%20%3D%20time()%0A%0A%20%20%20%20print(%22Execution%20time%3A%20%25s%20s%22%20%25%20int(end_time%20-%20start_time))%0A%20%20%20%20return%20end_time%2C%20random_forest_search%2C%20start_time%2C%20time%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Best%20Params%0A%0A%20%20%20%20Now%2C%20let's%20examine%20the%20results%20of%20each%20experiment%20to%20obtain%20the%20parameters%20of%20the%20best%20random%20forest%20model.%20First%2C%20we'll%20check%20each%20of%20the%20experiments%2C%20looking%20at%20its%20mean%20score%20and%20fit%20time%20and%20highlighting%20the%20winning%20experiment%20in%20green.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(model_selection%2C%20random_forest_search)%3A%0A%20%20%20%20model_selection.plot_experiments(random_forest_search)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(model_selection%2C%20random_forest_search)%3A%0A%20%20%20%20model_selection.plot_random_forest_parameters(random_forest_search)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(random_forest_search)%3A%0A%20%20%20%20random_forest_search.best_params_%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Evaluation%0A%0A%20%20%20%20With%20these%20params%20we'll%20now%20train%20and%20evaluate%20a%20Random%20Forest%20model%20using%20the%20full%20(train%20%2B%20validation)%20dataset.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(X_full%2C%20X_val%2C%20model_selection%2C%20random_forest_search%2C%20y_full%2C%20y_val)%3A%0A%20%20%20%20optimized_random_forest%20%3D%20model_selection.train_random_forest(X_full%2C%20y_full%2C%20param_distributions%3Drandom_forest_search.best_params_)%0A%0A%20%20%20%20print(%22Randomized%20search%20score%3A%20%25.2f%20%25%25%22%20%25%20(model_selection.eval_model(X_val%2C%20y_val%2
C%20optimized_random_forest)%20*%20100))%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20XGBoost%0A%0A%20%20%20%20At%20this%20point%2C%20we%20want%20to%20check%20if%20with%20XGBoost%20we%20would%20achieve%20better%20results%20than%20the%20ones%20we%20obtained%20with%20random%20forest.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(%0A%20%20%20%20X_train%2C%0A%20%20%20%20X_val%2C%0A%20%20%20%20end_time%2C%0A%20%20%20%20model_selection%2C%0A%20%20%20%20start_time%2C%0A%20%20%20%20time%2C%0A%20%20%20%20y_train%2C%0A%20%20%20%20y_val%2C%0A)%3A%0A%20%20%20%20import%20xgboost%20as%20xgb%0A%0A%20%20%20%20booster_param_distributions%20%3D%20%7B%0A%20%20%20%20%20%20%20%20%22eta%22%3A%20%5B0.1%2C%200.2%2C%200.3%2C%201.0%5D%2C%0A%20%20%20%20%20%20%20%20%22max_depth%22%3A%20%5B5%2C%2025%2C%2050%5D%2C%0A%20%20%20%20%20%20%20%20%22min_child_weight%22%3A%20%5B1%2C%203%2C%205%2C%207%5D%2C%0A%20%20%20%20%7D%0A%0A%20%20%20%20booster_start_time%20%3D%20time()%0A%20%20%20%20booster_search%20%3D%20model_selection.search_xgboost(X_train%2C%20y_train%2C%20X_val%2C%20y_val%2C%20booster_param_distributions)%0A%20%20%20%20booster_end_time%20%3D%20time()%0A%0A%20%20%20%20print(%22Execution%20time%3A%20%25s%20s%22%20%25%20int(end_time%20-%20start_time))%0A%20%20%20%20return%20(booster_search%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Best%20Params%0A%0A%20%20%20%20As%20we%20did%20before%2C%20we'll%20now%20examine%20the%20results%20of%20each%20experiment%20to%20obtain%20the%20best%20parameters%20for%20the%20XGBoost%20model.%20First%2C%20we'll%20check%20each%20of%20the%20experiments%20looking%20at%20its%20mean%20score%20and%20fit%20time%20highlighting%20in%20green%20the%20winner%20experiment.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(booster_search%2C%20model_sele
ction)%3A%0A%20%20%20%20model_selection.plot_experiments(booster_search)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(booster_search%2C%20model_selection)%3A%0A%20%20%20%20model_selection.plot_xgboost_parameters(booster_search)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(booster_search)%3A%0A%20%20%20%20booster_search.best_params_%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Evaluation%0A%0A%20%20%20%20With%20these%20params%20we'll%20now%20train%20and%20evaluate%20an%20XGBoost%20model%20using%20the%20full%20(train%20%2B%20validation)%20dataset.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20X_full%2C%0A%20%20%20%20X_val%2C%0A%20%20%20%20booster_search%2C%0A%20%20%20%20dict_vectorizer%2C%0A%20%20%20%20model_selection%2C%0A%20%20%20%20y_full%2C%0A%20%20%20%20y_val%2C%0A)%3A%0A%20%20%20%20optimized_booster%20%3D%20model_selection.train_booster(X_full%2C%20y_full%2C%20param_distributions%3Dbooster_search.best_params_)%0A%20%20%20%20optimized_booster.get_booster().feature_names%20%3D%20dict_vectorizer.get_feature_names_out().tolist()%0A%0A%20%20%20%20print(%22XGBoost%20search%20score%3A%20%25.2f%20%25%25%22%20%25%20(model_selection.eval_model(X_val%2C%20y_val%2C%20optimized_booster)%20*%20100))%0A%20%20%20%20return%20(optimized_booster%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Model%20Selection%0A%0A%20%20%20%20As%20we%20obtained%20significantly%20better%20results%20with%20XGBoost%2C%20we'll%20now%20save%20our%20final%20model%20for%20later%20use.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(optimized_booster)%3A%0A%20%20%20%20optimized_booster.save_model(%22booster_model.json%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20
%20%23%23%23%20Inspection%0A%0A%20%20%20%20As%20we%20chose%20a%20decision-tree-based%20model%2C%20we%20can%20now%20inspect%20it%20to%20see%20which%20features%20correlate%20best%20with%20the%20target%20variable%20(having%20a%20paid%20occupation).%20The%20most%20influential%20features%20in%20the%20model%20are%3A%0A%0A%20%20%20%20*%20**sexo1%3DMan**%3A%20This%20is%20the%20single%20most%20important%20feature.%20Being%20a%20man%20is%2C%20by%20far%2C%20the%20most%20effective%20feature%20for%20making%20splits%20that%20accurately%20predict%20the%20target%20variable%2C%20which%2C%20as%20a%20reminder%2C%20identifies%20whether%20the%20interviewed%20person%20had%20a%20paid%20occupation%20the%20week%20before%20the%20interview%20happened.%20This%20suggests%20a%20strong%20correlation%20between%20being%20male%20and%20having%20an%20occupation.%0A%0A%20%20%20%20*%20**nforma%3DHigher%20education**%3A%20This%20is%20the%20second%20most%20important%20feature.%20Having%20a%20higher%20education%20level%20is%20extremely%20predictive%2C%20likely%20indicating%20a%20strong%20positive%20association%20with%20being%20employed.%0A%0A%20%20%20%20*%20**eciv1%3DMarried**%3A%20Being%20married%20is%20also%20a%20very%20influential%20feature%2C%20possibly%20acting%20as%20a%20proxy%20for%20factors%20like%20age%20or%20stable%20employment%20history.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(optimized_booster%2C%20plt)%3A%0A%20%20%20%20from%20xgboost%20import%20plot_importance%0A%0A%20%20%20%20plot_importance(%0A%20%20%20%20%20%20%20%20optimized_booster.get_booster()%2C%0A%20%20%20%20%20%20%20%20max_num_features%3D3%2C%0A%20%20%20%20%20%20%20%20height%3D.7%2C%0A%20%20%20%20)%0A%0A%20%20%20%20plt.show()%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Shapley%20Additive%20Explanations%0A%0A%20%20%20%20To%20deepen%20our%20understanding%20of%20how%20each%20feature%20influences%20the%
20model%E2%80%99s%20predictions%2C%20we%20apply%20SHAP%2C%20which%20decomposes%20each%20individual%20prediction%20into%20contributions%20from%20each%20feature%2C%20helping%20us%20see%20not%20only%20which%20features%20are%20most%20important%20globally%2C%20but%20also%20how%20they%20push%20the%20prediction%20for%20individual%20cases.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(X_train%2C%20dict_vectorizer%2C%20optimized_booster)%3A%0A%20%20%20%20import%20shap%0A%0A%20%20%20%20explainer%20%3D%20shap.TreeExplainer(%0A%20%20%20%20%20%20%20%20optimized_booster%2C%0A%20%20%20%20%20%20%20%20feature_names%3Ddict_vectorizer.get_feature_names_out().tolist()%2C%0A%20%20%20%20)%0A%0A%20%20%20%20samples%20%3D%20shap.sample(X_train%2C%20nsamples%3D500)%0A%20%20%20%20shap_values%20%3D%20explainer(samples%2C%20check_additivity%3DFalse)%0A%0A%20%20%20%20def%20plot_shap_per_feature(feature_prefix%3A%20str)%3A%0A%20%20%20%20%20%20%20%20feature_names%20%3D%20explainer.feature_names%0A%20%20%20%20%20%20%20%20indices%20%3D%20%5Bi%20for%20i%2C%20name%20in%20enumerate(feature_names)%20if%20name.startswith(feature_prefix)%5D%0A%20%20%20%20%20%20%20%20features%20%3D%20%5Bfeature_names%5Bi%5D%20for%20i%20in%20indices%5D%0A%0A%20%20%20%20%20%20%20%20filtered_shap_values%20%3D%20shap_values%5B%3A%2C%20indices%5D%0A%20%20%20%20%20%20%20%20filtered_samples%20%3D%20samples%5B%3A%2C%20indices%5D%0A%0A%20%20%20%20%20%20%20%20shap.summary_plot(filtered_shap_values%2C%20filtered_samples%2C%20feature_names%3Dfeatures%2C%20max_display%3D100%2C%20cmap%3D%22cividis%22%2C%20alpha%3D0.5)%0A%20%20%20%20return%20(plot_shap_per_feature%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Per%20Sex%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(plot_shap_per_feature)%3A%0A%20%20%20%20plot_shap_per_feature(%22sexo1%3DMan%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_cod
e%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Per%20Education%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(plot_shap_per_feature)%3A%0A%20%20%20%20plot_shap_per_feature(%22nforma%3DHigher%20education%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(plot_shap_per_feature)%3A%0A%20%20%20%20plot_shap_per_feature(%22nforma%3DPrimary%20education%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Per%20Age%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(plot_shap_per_feature)%3A%0A%20%20%20%20plot_shap_per_feature(%22edad1%3D16%20to%2019%20years%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(plot_shap_per_feature)%3A%0A%20%20%20%20plot_shap_per_feature(%22edad1%3D65%20or%20more%20years%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Per%20Marital%20Status%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(plot_shap_per_feature)%3A%0A%20%20%20%20plot_shap_per_feature(%22eciv1%3DSingle%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(plot_shap_per_feature)%3A%0A%20%20%20%20plot_shap_per_feature(%22eciv1%3DMarried%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Per%20Province%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(plot_shap_per_feature)%3A%0A%20%20%20%20plot_shap_per_feature(%22prov%3D%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%20Publishing%20the%20Model%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(X_val%2C%20dict_vectorizer%2C%20model%2C%20model_selection%2C%20y_val)%3
A%0A%20%20%20%20params%20%3D%20%7B%0A%20%20%20%20%20%20%20%20%22min_child_weight%22%3A%203%2C%0A%20%20%20%20%20%20%20%20%22max_depth%22%3A%205%2C%0A%20%20%20%20%20%20%20%20%22eta%22%3A%201.0%0A%20%20%20%20%7D%0A%0A%20%20%20%20loaded_booster%20%3D%20model.load_booster(%22booster_model.json%22%2C%20params)%0A%20%20%20%20loaded_booster.get_booster().feature_names%20%3D%20dict_vectorizer.get_feature_names_out().tolist()%0A%0A%20%20%20%20print(%22Loaded%20model%20score%3A%20%25.2f%20%25%25%22%20%25%20(model_selection.eval_model(X_val%2C%20y_val%2C%20loaded_booster)%20*%20100))%0A%20%20%20%20return%20(loaded_booster%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%23%20Predictions%0A%0A%20%20%20%20The%20code%20below%20shows%20how%20we'd%20proceed%20to%20use%20the%20model%20in%20order%20to%20make%20predictions.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(dict_vectorizer%2C%20loaded_booster)%3A%0A%20%20%20%20tests%20%3D%20%5B%0A%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%22prov%22%3A%20%22Madrid%22%2C%0A%20%20%20%20%20%20%20%20%09%22edad1%22%3A%20%2240%20to%2044%20years%22%2C%0A%20%20%20%20%20%20%20%20%09%22sexo1%22%3A%20%22Man%22%2C%0A%20%20%20%20%20%20%20%20%09%22eciv1%22%3A%20%22Married%22%2C%0A%20%20%20%20%20%20%20%20%09%22nforma%22%3A%20%22Higher%20education%22%2C%0A%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%22prov%22%3A%20%22Santa%20Cruz%20de%20Tenerife%22%2C%0A%20%20%20%20%20%20%20%20%09%22edad1%22%3A%20%2250%20to%2054%20years%22%2C%0A%20%20%20%20%20%20%20%20%09%22sexo1%22%3A%20%22Woman%22%2C%0A%20%20%20%20%20%20%20%20%09%22eciv1%22%3A%20%22Single%22%2C%0A%20%20%20%20%20%20%20%20%09%22nforma%22%3A%20%22Upper%20secondary%20education%20%E2%80%94%20vocational%20track%22%2C%0A%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%5D%0A%0A%20%20%20%20X_test%20%3D%20dict_vectorizer.transform(tests)%0A%20%20%20%
20loaded_booster.predict(X_test)%0A%20%20%20%20return%0A%0A%0Aif%20__name__%20%3D%3D%20%22__main__%22%3A%0A%20%20%20%20app.run()%0A