notes

import%20marimo%0A%0A__generated_with%20%3D%20%220.16.0%22%0Aapp%20%3D%20marimo.App(width%3D%22medium%22)%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20import%20marimo%20as%20mo%0A%20%20%20%20import%20pandas%20as%20pd%0A%20%20%20%20import%20numpy%20as%20np%0A%20%20%20%20import%20matplotlib.pyplot%20as%20plt%0A%20%20%20%20import%20seaborn%20as%20sns%0A%20%20%20%20return%20mo%2C%20np%2C%20pd%2C%20plt%2C%20sns%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20r%22%22%22%0A%20%20%20%20%23%20Machine%20Learning%20Zoomcamp%0A%0A%20%20%20%20%23%23%20Module%206%3A%20**Decision%20Trees**%0A%20%20%20%20%22%22%22%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(pd)%3A%0A%20%20%20%20repository_root%20%3D%20(%0A%20%20%20%20%20%20%20%20%22https%3A%2F%2Fgithub.com%2FDataTalksClub%2Fmachine-learning-zoomcamp%2Fblob%2Fmaster%2F%22%0A%20%20%20%20)%0A%0A%20%20%20%20chapters%20%3D%20pd.DataFrame(%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22title%22%3A%20%22Credit%20Risk%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22youtube_id%22%3A%20%22GJGmlfZoCoU%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22contents%22%3A%20repository_root%20%2B%20%2206-trees%2F01-credit-risk.md%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22title%22%3A%20%22Data%20Preparation%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22youtube_id%22%3A%20%22tfuQdI3YO2c%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22contents%22%3A%20repository_root%20%2B%20%2206-trees%2F02-data-prep.md%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22title%22%3A%20%22Decision%20Trees%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22youtube_id%22%3A%20%22YGiQvFbSIg8%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22contents%22%3A%20repository_root%20%2B%20%2206-trees%2F03-decision-trees.md%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22title%22%3A%20%22Decision%20Tree%20Learning%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22youtube_id%22%3A%20%22XODz6LwKY7g%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22contents%22%3A%20repository_root%20%2B%20%2206-trees%2F04-decision-tree-learning.md%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22title%22%3A%20%22Decision%20Tree%20Tuning%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22youtube_id%22%3A%20%22XJaxwH50Qok%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22contents%22%3A%20repository_root%20%2B%20%2206-trees%2F05-decision-tree-tuning.md%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22title%22%3A%20%22Random%20Forest%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22youtube_id%22%3A%20%22FZhcmOfNNZE%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22contents%22%3A%20repository_root%20%2B%20%2206-trees%2F06-random-forest.md%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22title%22%3A%20%22Gradient%20boosting%20and%20XGBoost%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22youtube_id%22%3A%20%22xFarGClszEM%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22contents%22%3A%20repository_root%20%2B%20%2206-trees%2F07-boosting.md%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22title%22%3A%20%22XGBoost%20Parameter%20Tuning%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22youtube_id%22%3A%20%22VX6ftRzYROM%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22contents%22%3A%20repository_root%20%2B%20%2206-trees%2F08-xgb-tuning.md%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22title%22%3A%20%22Final%20Model%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22youtube_id%22%3A%20%22lqdnyIVQq-M%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22contents%22%3A%20repository_root%20%2B%20%2206-trees%2F09-final-model.md%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22title%22%3A%20%22Summary%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22youtube_id%22%3A%20%22JZ6sRZ_5j_c%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22contents%22%3A%20repository_root%20%2B%20%2206-trees%2F10-summary.md%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22title%22%3A%20%22Explore%20More%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22contents%22%3A%20repository_root%20%2B%20%2206-trees%2F11-explore-more.md%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20)%0A%0A%20%20%20%20chapters.insert(%0A%20%20%20%20%20%20%20%20loc%3D0%2C%0A%20%20%20%20%20%20%20%20column%3D%22snapshot%22%2C%0A%20%20%20%20%20%20%20%20value%3D%22https%3A%2F%2Fimg.youtube.com%2Fvi%2F%22%0A%20%20%20%20%20%20%20%20%2B%20chapters.youtube_id.astype(str)%0A%20%20%20%20%20%20%20%20%2B%20%22%2Fhqdefault.jpg%22%2C%0A%20%20%20%20)%0A%20%20%20%20chapters.insert(%0A%20%20%20%20%20%20%20%20loc%3D2%2C%0A%20%20%20%20%20%20%20%20column%3D%22youtube%22%2C%0A%20%20%20%20%20%20%20%20value%3D%22https%3A%2F%2Fyoutube.com%2Fwatch%3Fv%3D%22%20%2B%20chapters.youtube_id.astype(str)%2C%0A%20%20%20%20)%0A%0A%20%20%20%20videos%20%3D%20chapters%5Bchapters%5B%22youtube_id%22%5D.notnull()%5D%0A%20%20%20%20videos%5B%5B%22snapshot%22%2C%20%22title%22%2C%20%22youtube%22%5D%5D%0A%20%20%20%20return%20(chapters%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(chapters)%3A%0A%20%20%20%20contents%20%3D%20chapters%5Bchapters%5B%22contents%22%5D.notnull()%5D%0A%20%20%20%20contents%5B%5B%22title%22%2C%20%22contents%22%5D%5D%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20r%22%22%22%0A%20%20%20%20%23%23%20Credit%20Risk%0A%0A%20%20%20%20In%20this%20module%20we'll%20be%20creating%20a%20credit%20risk%20scoring%20model.%20To%20do%20so%2C%20we'll%20be%20using%20a%20specific%20%5BCreditScoring%5D(https%3A%2F%2Fgithub.com%2Fgastonstat%2FCreditScoring)%20dataset%20that%20has%20been%20copied%20into%20the%20data%20folder%20for%20easier%20access%20and%20loading.%0A%0A%20%20%20%20The%20model%20will%20decide%20whether%20a%20client%20is%20likely%20to%20return%20a%20credit%3A%0A%0A%20%20%20%20-%20if%20the%20model%20returns%200%2C%20the%20client%20is%20very%20likely%20to%20payback%20and%20the%20loan%20is%20approved%0A%20%20%20%20-%20if%20the%20model%20returns%201%2C%20the%20client%20is%20not%20likely%20to%20payback%20and%20the%20loan%20is%20rejected%0A%20%20%20%20%22%22%22%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(pd)%3A%0A%20%20%20%20df_raw%20%3D%20pd.read_csv(%22module-6%2Fdata%2FCreditScoring.csv%22)%0A%0A%20%20%20%20df_raw.head()%0A%20%20%20%20return%20(df_raw%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20r%22%22%22%0A%20%20%20%20%23%23%20Data%20Cleaning%20and%20Preparation%0A%0A%20%20%20%20%23%23%23%20Quick%20Look%20at%20the%20Data%20Types%0A%20%20%20%20%22%22%22%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(df_raw)%3A%0A%20%20%20%20df_raw.dtypes%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%20Normalize%20Column%20Names%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(df_raw%2C%20pd)%3A%0A%20%20%20%20def%20normalize_column_names(df%3A%20pd.DataFrame)%20-%3E%20pd.DataFrame%3A%0A%20%20%20%20%20%20%20%20copy%20%3D%20df.copy()%0A%20%20%20%20%20%20%20%20copy.columns%20%3D%20copy.columns.str.lower()%0A%0A%20%20%20%20%20%20%20%20return%20copy%0A%0A%20%20%20%20def%20normalize_column_names_preview(df%3A%20pd.DataFrame)%20-%3E%20pd.DataFrame%3A%0A%20%20%20%20%20%20%20%20df_normalized%20%3D%20normalize_column_names(df_raw)%0A%0A%20%20%20%20%20%20%20%20return%20df_normalized.head()%0A%0A%20%20%20%20normalize_column_names_preview(df_raw)%0A%20%20%20%20return%20(normalize_column_names%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%20Set%20Category%20Names%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(df_raw%2C%20normalize_column_names%2C%20pd)%3A%0A%20%20%20%20def%20set_category_names(df%3A%20pd.DataFrame)%20-%3E%20pd.DataFrame%3A%0A%20%20%20%20%20%20%20%20copy%20%3D%20df.copy()%0A%20%20%20%20%20%20%20%20copy.status%20%3D%20copy.status.map(%7B1%3A%20'ok'%2C%202%3A%20'default'%2C%200%3A%20'unknown'%7D)%0A%20%20%20%20%20%20%20%20copy.home%20%3D%20copy.home.map(%7B1%3A%20'rent'%2C%202%3A%20'owner'%2C%203%3A%20'private'%2C%204%3A%20'ignore'%2C%205%3A%20'parents'%2C%206%3A%20'other'%2C%200%3A%20'unknown'%2C%7D)%0A%20%20%20%20%20%20%20%20copy.marital%20%3D%20copy.marital.map(%7B1%3A%20'single'%2C%202%3A%20'married'%2C%203%3A%20'widow'%2C%204%3A%20'separated'%2C%205%3A%20'divorced'%2C%200%3A%20'unknown'%7D)%0A%20%20%20%20%20%20%20%20copy.records%20%3D%20copy.records.map(%7B1%3A%20'no'%2C%202%3A%20'yes'%2C%200%3A%20'unknown'%7D)%0A%20%20%20%20%20%20%20%20copy.job%20%3D%20copy.job.map(%7B1%3A%20'fixed'%2C%202%3A%20'partime'%2C%203%3A%20'freelance'%2C%204%3A%20'others'%2C%200%3A%20'unknown'%7D)%0A%0A%20%20%20%20%20%20%20%20return%20copy%0A%0A%20%20%20%20def%20set_category_names_preview(df%3A%20pd.DataFrame)%20-%3E%20pd.DataFrame%3A%0A%20%20%20%20%20%20%20%20df_normalized%20%3D%20normalize_column_names(df_raw)%0A%20%20%20%20%20%20%20%20df_categorized%20%3D%20set_category_names(df_normalized)%0A%0A%20%20%20%20%20%20%20%20return%20df_categorized.head()%0A%0A%20%20%20%20set_category_names_preview(df_raw)%0A%20%20%20%20return%20(set_category_names%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%20Handle%20Missing%20Values%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(df_raw)%3A%0A%20%20%20%20df_raw.describe().round()%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(df_raw%2C%20normalize_column_names%2C%20np%2C%20pd%2C%20set_category_names)%3A%0A%20%20%20%20def%20tag_missing_values(df%3A%20pd.DataFrame)%20-%3E%20pd.DataFrame%3A%0A%20%20%20%20%20%20%20%20copy%20%3D%20df.copy()%0A%20%20%20%20%20%20%20%20copy.income%20%3D%20copy.income.replace(to_replace%3D99999999%2C%20value%3Dnp.nan)%0A%20%20%20%20%20%20%20%20copy.assets%20%3D%20copy.assets.replace(to_replace%3D99999999%2C%20value%3Dnp.nan)%0A%20%20%20%20%20%20%20%20copy.debt%20%3D%20copy.debt.replace(to_replace%3D99999999%2C%20value%3Dnp.nan)%0A%0A%20%20%20%20%20%20%20%20return%20copy%0A%0A%20%20%20%20def%20tag_missing_values_preview(df%3A%20pd.DataFrame)%20-%3E%20pd.DataFrame%3A%0A%20%20%20%20%20%20%20%20df_normalized%20%3D%20normalize_column_names(df_raw)%0A%20%20%20%20%20%20%20%20df_categorized%20%3D%20set_category_names(df_normalized)%0A%20%20%20%20%20%20%20%20df_missing%20%3D%20tag_missing_values(df_categorized)%0A%0A%20%20%20%20%20%20%20%20return%20df_missing.describe()%0A%0A%20%20%20%20tag_missing_values_preview(df_raw)%0A%20%20%20%20return%20(tag_missing_values%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20r%22%22%22%0A%20%20%20%20%23%23%20Filter%20Target%20Variable%0A%0A%20%20%20%20We%20can%20only%20make%20use%20of%20the%20part%20of%20the%20dataset%20that%20has%20clearly%20defined%20outcomes%2C%20but%20the%20target%20variable%20contains%20records%20where%20it's%20unknown.%20We'll%20now%20filter%20those%20records.%0A%20%20%20%20%22%22%22%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20df_raw%2C%0A%20%20%20%20normalize_column_names%2C%0A%20%20%20%20pd%2C%0A%20%20%20%20set_category_names%2C%0A%20%20%20%20tag_missing_values%2C%0A)%3A%0A%20%20%20%20def%20filter_target(df%3A%20pd.DataFrame)%20-%3E%20pd.DataFrame%3A%0A%20%20%20%20%20%20%20%20copy%20%3D%20df.copy()%0A%20%20%20%20%20%20%20%20copy%20%3D%20copy%5Bcopy.status%20!%3D%20'unknown'%5D.reset_index(drop%3DTrue)%0A%0A%20%20%20%20%20%20%20%20return%20copy%0A%0A%20%20%20%20def%20filter_target_preview(df%3A%20pd.DataFrame)%20-%3E%20pd.DataFrame%3A%0A%20%20%20%20%20%20%20%20df_normalized%20%3D%20normalize_column_names(df_raw)%0A%20%20%20%20%20%20%20%20df_categorized%20%3D%20set_category_names(df_normalized)%0A%20%20%20%20%20%20%20%20df_missing%20%3D%20tag_missing_values(df_categorized)%0A%20%20%20%20%20%20%20%20df_filtered%20%3D%20filter_target(df_missing)%0A%0A%20%20%20%20%20%20%20%20return%20df_filtered.head()%0A%0A%20%20%20%20filter_target_preview(df_raw)%0A%20%20%20%20return%20(filter_target%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%20Set%20up%20the%20Validation%20Framework%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20df_raw%2C%0A%20%20%20%20filter_target%2C%0A%20%20%20%20normalize_column_names%2C%0A%20%20%20%20pd%2C%0A%20%20%20%20set_category_names%2C%0A%20%20%20%20tag_missing_values%2C%0A)%3A%0A%20%20%20%20from%20sklearn.model_selection%20import%20train_test_split%0A%0A%20%20%20%20def%20preprocess(df%3A%20pd.DataFrame)%20-%3E%20pd.DataFrame%3A%0A%20%20%20%20%20%20%20%20df_normalized%20%3D%20normalize_column_names(df_raw)%0A%20%20%20%20%20%20%20%20df_categorized%20%3D%20set_category_names(df_normalized)%0A%20%20%20%20%20%20%20%20df_missing%20%3D%20tag_missing_values(df_categorized)%0A%0A%20%20%20%20%20%20%20%20return%20filter_target(df_missing)%0A%0A%20%20%20%20def%20split(df%3A%20pd.DataFrame)%20-%3E%20(pd.DataFrame%2C%20pd.DataFrame%2C%20pd.DataFrame%2C%20pd.DataFrame)%3A%0A%20%20%20%20%20%20%20%20df_full%2C%20df_test%20%3D%20train_test_split(df%2C%20test_size%3D0.2%2C%20random_state%3D11)%0A%20%20%20%20%20%20%20%20df_train%2C%20df_val%20%3D%20train_test_split(df_full%2C%20test_size%3D0.25%2C%20random_state%3D11)%0A%0A%20%20%20%20%20%20%20%20df_full%20%3D%20df_full.reset_index(drop%3DTrue)%0A%20%20%20%20%20%20%20%20df_train%20%3D%20df_train.reset_index(drop%3DTrue)%0A%20%20%20%20%20%20%20%20df_val%20%3D%20df_val.reset_index(drop%3DTrue)%0A%20%20%20%20%20%20%20%20df_test%20%3D%20df_test.reset_index(drop%3DTrue)%0A%0A%20%20%20%20%20%20%20%20return%20df_full%2C%20df_train%2C%20df_val%2C%20df_test%0A%0A%20%20%20%20df_full%2C%20df_train%2C%20df_val%2C%20df_test%20%3D%20split(preprocess(df_raw))%0A%20%20%20%20return%20df_full%2C%20df_test%2C%20df_train%2C%20df_val%0A%0A%0A%40app.cell%0Adef%20_(df_full%2C%20pd)%3A%0A%20%20%20%20def%20separate_target(df%3A%20pd.DataFrame)%20-%3E%20pd.DataFrame%3A%0A%20%20%20%20%20%20%20%20features%20%3D%20df.copy()%0A%20%20%20%20%20%20%20%20target%20%3D%20(df.status%20%3D%3D%20'default').astype(int).values%0A%20%20%20%20%20%20%20%20del%20features%5B%22status%22%5D%0A%0A%20%20%20%20%20%20%20%20return%20features%2C%20target%0A%0A%20%20%20%20separate_target(df_full)%0A%20%20%20%20return%20(separate_target%2C)%0A%0A%0A%40app.cell%0Adef%20_(df_full%2C%20pd%2C%20separate_target)%3A%0A%20%20%20%20from%20typing%20import%20Optional%0A%20%20%20%20from%20sklearn.feature_extraction%20import%20DictVectorizer%0A%0A%20%20%20%20def%20train_dictionary_vectorizer(df%3A%20pd.DataFrame)%20-%3E%20(DictVectorizer%2C%20dict)%3A%0A%20%20%20%20%20%20%20%20dictionary%20%3D%20df.to_dict(orient%3D'records')%0A%20%20%20%20%20%20%20%20dict_vectorizer%20%3D%20DictVectorizer(sparse%3DFalse)%0A%20%20%20%20%20%20%20%20X%20%3D%20dict_vectorizer.fit_transform(dictionary)%0A%0A%20%20%20%20%20%20%20%20return%20dict_vectorizer%2C%20X%0A%0A%20%20%20%20def%20get_features_and_target(df%3A%20pd.DataFrame%2C%20dict_vectorizer%3A%20Optional%5BDictVectorizer%5D%20%3D%20None)%20%5C%0A%20%20%20%20%20%20%20%20-%3E%20(pd.DataFrame%2C%20pd.DataFrame%2C%20DictVectorizer%0A%20%20%20%20)%3A%0A%20%20%20%20%20%20%20%20features%2C%20y%20%3D%20separate_target(df)%0A%0A%20%20%20%20%20%20%20%20if%20not%20dict_vectorizer%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20dict_vectorizer%2C%20X%20%3D%20train_dictionary_vectorizer(features)%0A%20%20%20%20%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20X%20%3D%20dict_vectorizer.transform(features.to_dict(orient%3D'records'))%0A%0A%20%20%20%20%20%20%20%20return%20X%2C%20y%2C%20dict_vectorizer%0A%0A%20%20%20%20get_features_and_target(df_full)%0A%20%20%20%20return%20DictVectorizer%2C%20Optional%2C%20get_features_and_target%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20r%22%22%22%0A%20%20%20%20%23%23%20Decision%20Tree%0A%0A%20%20%20%20Decision%20trees%20are%20a%20data%20structure%20that%20encodes%20information%20about%20a%20dataset%20in%20the%20form%20of%20conditions%20(if%20statements).%20Each%20of%20the%20conditions%20typically%20relates%20with%20a%20field%20from%20the%20dataset%2C%20a%20comparison%20symbol%20(%3C%2C%20%3C%3D%2C%20%3E%3D%2C%20%3E)%20and%20a%20value.%20From%20each%20node%2C%20two%20branches%20are%20maintained%20to%20the%20records%20that%20match%20the%20condition%20and%20the%20records%20that%20don't%20match%20it.%0A%20%20%20%20%22%22%22%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(DictVectorizer%2C%20df_train%2C%20get_features_and_target%2C%20pd)%3A%0A%20%20%20%20from%20sklearn.tree%20import%20DecisionTreeClassifier%0A%0A%20%20%20%20def%20train_decision_tree(df%3A%20pd.DataFrame%2C%20max_depth%3A%20int%20%3D%20None%2C%20min_samples_leaf%3A%20int%20%3D%201)%20%5C%0A%20%20%20%20%20%20%20%20-%3E%20(DecisionTreeClassifier%2C%20DictVectorizer%0A%20%20%20%20)%3A%0A%20%20%20%20%20%20%20%20decision_tree%20%3D%20DecisionTreeClassifier(max_depth%3Dmax_depth%2C%20min_samples_leaf%3Dmin_samples_leaf)%0A%20%20%20%20%20%20%20%20X%2C%20y%2C%20dict_vectorizer%20%3D%20get_features_and_target(df)%0A%20%20%20%20%20%20%20%20decision_tree.fit(X%2C%20y)%0A%0A%20%20%20%20%20%20%20%20return%20decision_tree%2C%20dict_vectorizer%0A%0A%20%20%20%20overfitted_decision_tree%2C%20dict_vectorizer%20%3D%20train_decision_tree(df_train)%0A%20%20%20%20return%20dict_vectorizer%2C%20overfitted_decision_tree%2C%20train_decision_tree%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20r%22%22%22%0A%20%20%20%20%23%23%23%20Evaluate%20the%20model%0A%0A%20%20%20%20At%20the%20moment%2C%20we%20have%20100%25%20of%20accuracy%20on%20the%20train%20set%20but%20an%20accuracy%20of%20around%2066%25%20on%20our%20validation%20set.%20We%20are%20**overfitting**%20our%20model.%0A%20%20%20%20%22%22%22%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20DictVectorizer%2C%0A%20%20%20%20df_train%2C%0A%20%20%20%20df_val%2C%0A%20%20%20%20dict_vectorizer%2C%0A%20%20%20%20get_features_and_target%2C%0A%20%20%20%20overfitted_decision_tree%2C%0A%20%20%20%20pd%2C%0A)%3A%0A%20%20%20%20from%20sklearn.metrics%20import%20roc_auc_score%0A%20%20%20%20from%20sklearn.base%20import%20ClassifierMixin%0A%0A%20%20%20%20def%20get_roc_auc_score(df%3A%20pd.DataFrame%2C%20dict_vectorizer%3A%20DictVectorizer%2C%20model%3A%20ClassifierMixin)%20-%3E%20float%3A%0A%20%20%20%20%20%20%20%20X%2C%20y%2C%20_%20%3D%20get_features_and_target(df%2C%20dict_vectorizer)%0A%20%20%20%20%20%20%20%20y_pred%20%3D%20model.predict_proba(X)%5B%3A%2C1%5D%0A%0A%20%20%20%20%20%20%20%20return%20roc_auc_score(y%2C%20y_pred)%0A%0A%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%22roc_auc_val%22%3A%20get_roc_auc_score(df_val%2C%20dict_vectorizer%2C%20overfitted_decision_tree)%2C%0A%20%20%20%20%20%20%20%20%22roc_auc_train%22%3A%20get_roc_auc_score(df_train%2C%20dict_vectorizer%2C%20overfitted_decision_tree)%0A%20%20%20%20%7D%0A%20%20%20%20return%20get_roc_auc_score%2C%20roc_auc_score%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22By%20default%2C%20decision%20trees%20can%20grow%20as%20much%20as%20they%20want%2C%20what%20makes%20them%20prone%20to%20overfitting.%20To%20address%20this%2C%20we%20can%20set%20a%20maximum%20depth%20when%20we%20create%20them.%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20df_train%2C%0A%20%20%20%20df_val%2C%0A%20%20%20%20dict_vectorizer%2C%0A%20%20%20%20get_roc_auc_score%2C%0A%20%20%20%20train_decision_tree%2C%0A)%3A%0A%20%20%20%20decision_tree%2C%20_%20%3D%20train_decision_tree(df_train%2C%20max_depth%3D3)%0A%0A%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%22roc_auc_val%22%3A%20get_roc_auc_score(df_val%2C%20dict_vectorizer%2C%20decision_tree)%2C%0A%20%20%20%20%20%20%20%20%22roc_auc_train%22%3A%20get_roc_auc_score(df_train%2C%20dict_vectorizer%2C%20decision_tree)%0A%20%20%20%20%7D%0A%20%20%20%20return%20(decision_tree%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%23%20Explore%20the%20tree%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(decision_tree%2C%20dict_vectorizer%2C%20plt)%3A%0A%20%20%20%20from%20sklearn.tree%20import%20plot_tree%0A%0A%20%20%20%20plt.figure(figsize%3D(16%2C%209))%0A%20%20%20%20plot_tree(decision_tree%2C%20feature_names%3Ddict_vectorizer.get_feature_names_out()%2C%20fontsize%3D10)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20r%22%22%22%0A%20%20%20%20%23%23%20Decision%20Tree%20Learning%0A%0A%20%20%20%20A%20decision%20tree%20is%20a%20model%20that%20makes%20predictions%20by%20following%20a%20series%20of%20yes%2Fno%20questions%20based%20on%20the%20features%20of%20the%20data.%20Each%20internal%20node%20represents%20a%20question%20(for%20example%2C%20**income%20%3C%3D%2074.5**)%2C%20each%20branch%20represents%20an%20answer%20(*yes*%20or%20*no*)%20and%20each%20leaf%20node%20gives%20the%20final%20decision%20or%20prediction.%0A%0A%20%20%20%20The%20tree%20learns%20these%20questions%20automatically%20from%20training%20data%20by%20finding%2C%20at%20each%20step%2C%20the%20feature%20and%20threshold%20that%20best%20separate%20the%20examples%20into%20groups%20that%20are%20as%20homogeneous%20as%20possible%20with%20respect%20to%20the%20target%20(for%20instance%3A%20all%20*yes*%20or%20all%20*no*).%0A%0A%20%20%20%20This%20process%20continues%20recursively%20until%20the%20data%20are%20well%20classified%20or%20other%20stopping%20conditions%20are%20met%2C%20producing%20a%20model%20that%20can%20later%20be%20used%20to%20classify%20new%2C%20unseen%20examples%20by%20following%20the%20same%20sequence%20of%20decisions.%0A%0A%20%20%20%20%23%23%23%20Simplified%20algorithm%0A%0A%20%20%20%20This%20pseudocode%20shows%20how%20we%20could%20implement%20an%20algorithm%20that%20finds%20the%20best%20split.%0A%0A%20%20%20%20%60%60%60python%0A%20%20%20%20decision_tree%20%3D%20%7B%7D%0A%0A%20%20%20%20for%20feature%20in%20features%3A%0A%20%20%20%20%20%20%20%20thresholds%20%3D%20find_all_thresholds(feature)%0A%20%20%20%20%20%20%20%20impurities%20%3D%20%7B%7D%0A%0A%20%20%20%20%20%20%20%20for%20threshold%20in%20thresholds%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20condition%20%3D%20define_condition(feature%2C%20'%3C%3D'%2C%20threshold)%0A%20%20%20%20%20%20%20%20%20%20%20%20splitted_dataset%20%3D%20split(condition)%0A%20%20%20%20%20%20%20%20%20%20%20%20impurity%20%3D%20compute_impurity(splitted_dataset)%0A%20%20%20%20%20%20%20%20%20%20%20%20impurities%5Bcondition%5D%20%3D%20impurity%0A%0A%20%20%20%20%20%20%20%20decision_tree%5Bfeature%5D%20%3D%20select_condition_with_lowest_impurity(impurities)%0A%20%20%20%20%60%60%60%0A%20%20%20%20%22%22%22%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%20Decision%20Tree%20Tuning%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20df_train%2C%0A%20%20%20%20df_val%2C%0A%20%20%20%20dict_vectorizer%2C%0A%20%20%20%20get_roc_auc_score%2C%0A%20%20%20%20pd%2C%0A%20%20%20%20train_decision_tree%2C%0A)%3A%0A%20%20%20%20def%20search_best_decision_tree()%3A%0A%20%20%20%20%20%20%20%20scores%20%3D%20%5B%5D%0A%0A%20%20%20%20%20%20%20%20for%20depth%20in%20%5BNone%2C%201%2C%202%2C%204%2C%208%2C%2016%2C%2032%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20for%20min_samples%20in%20%5B1%2C%205%2C%2010%2C%2050%2C%20100%2C%20250%2C%20500%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20decision_tree%2C%20_%20%3D%20train_decision_tree(df_train%2C%20max_depth%3Ddepth%2C%20min_samples_leaf%3Dmin_samples)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20score%20%3D%20get_roc_auc_score(df_val%2C%20dict_vectorizer%2C%20decision_tree)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20scores.append((depth%2C%20min_samples%2C%20score))%0A%0A%20%20%20%20%20%20%20%20columns%20%3D%20%5B%22max_depth%22%2C%20%22min_samples_leaf%22%2C%20%22roc_auc%22%5D%0A%20%20%20%20%20%20%20%20df_scores%20%3D%20pd.DataFrame(scores%2C%20columns%3Dcolumns)%0A%0A%20%20%20%20%20%20%20%20return%20df_scores%0A%0A%20%20%20%20decision_tree_scores%20%3D%20search_best_decision_tree()%0A%20%20%20%20decision_tree_scores.sort_values(%22roc_auc%22%2C%20ascending%3DFalse).head()%0A%20%20%20%20return%20(decision_tree_scores%2C)%0A%0A%0A%40app.cell%0Adef%20_(decision_tree_scores%2C%20sns)%3A%0A%20%20%20%20sns.heatmap(decision_tree_scores.pivot(index%3D%22min_samples_leaf%22%2C%20columns%3D%5B%22max_depth%22%5D)%2C%20annot%3DTrue%2C%20fmt%3D'.2f')%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20r%22%22%22%0A%20%20%20%20%23%23%20Random%20Forest%0A%0A%20%20%20%20A%20random%20forest%20is%20an%20ensemble%20model%20that%20improves%20on%20decision%20trees%20by%20combining%20many%20of%20them%20to%20make%20more%20reliable%20predictions.%20Each%20tree%20in%20the%20forest%20is%20trained%20on%20a%20slightly%20different%20random%20subset%20of%20the%20data%20and%20considers%20only%20a%20random%20selection%20of%20features%20when%20choosing%20splits%2C%20which%20helps%20reduce%20overfitting%20and%20increases%20generalization.%0A%0A%20%20%20%20When%20making%20a%20prediction%2C%20all%20the%20trees%20%E2%80%9Cvote%E2%80%9D%3A%20for%20classification%2C%20the%20class%20chosen%20by%20most%20trees%20is%20the%20final%20output%3B%20for%20regression%2C%20their%20average%20prediction%20is%20used.%20In%20essence%2C%20a%20random%20forest%20uses%20the%20wisdom%20of%20many%20diverse%20trees%20to%20make%20decisions%20that%20are%20more%20stable%20and%20accurate%20than%20any%20single%20tree%20alone.%0A%20%20%20%20%22%22%22%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%23%20Train%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(DictVectorizer%2C%20Optional%2C%20df_train%2C%20get_features_and_target%2C%20pd)%3A%0A%20%20%20%20from%20sklearn.ensemble%20import%20RandomForestClassifier%0A%0A%20%20%20%20def%20train_random_forest(df%3A%20pd.DataFrame%2C%20n_estimators%3A%20int%20%3D%2010%2C%20max_depth%3A%20Optional%5Bint%5D%20%3D%20None%2C%20min_samples_leaf%3A%20int%20%3D%201)%20%5C%0A%20%20%20%20%20%20%20%20-%3E%20(RandomForestClassifier%2C%20DictVectorizer%0A%20%20%20%20)%3A%0A%20%20%20%20%20%20%20%20random_forest%20%3D%20RandomForestClassifier(n_estimators%3Dn_estimators%2C%20max_depth%3Dmax_depth%2C%20min_samples_leaf%3Dmin_samples_leaf)%0A%20%20%20%20%20%20%20%20X%2C%20y%2C%20dict_vectorizer%20%3D%20get_features_and_target(df)%0A%20%20%20%20%20%20%20%20random_forest.fit(X%2C%20y)%0A%0A%20%20%20%20%20%20%20%20return%20random_forest%2C%20dict_vectorizer%0A%0A%20%20%20%20overfitted_random_forest%2C%20_%20%3D%20train_random_forest(df_train)%0A%20%20%20%20return%20overfitted_random_forest%2C%20train_random_forest%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20r%22%22%22%0A%20%20%20%20%23%23%23%20Evaluate%0A%0A%20%20%20%20With%20a%20quick%20evaluation%20we%20can%20see%20that%2C%20as%20it%20happened%20before%2C%20we%20are%20overfitting.%0A%20%20%20%20%22%22%22%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20df_train%2C%0A%20%20%20%20df_val%2C%0A%20%20%20%20dict_vectorizer%2C%0A%20%20%20%20get_roc_auc_score%2C%0A%20%20%20%20overfitted_random_forest%2C%0A)%3A%0A%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%22roc_auc_val%22%3A%20get_roc_auc_score(df_val%2C%20dict_vectorizer%2C%20overfitted_random_forest)%2C%0A%20%20%20%20%20%20%20%20%22roc_auc_train%22%3A%20get_roc_auc_score(df_train%2C%20dict_vectorizer%2C%20overfitted_random_forest)%0A%20%20%20%20%7D%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%23%20Choose%20the%20Number%20of%20Estimators%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20df_train%2C%0A%20%20%20%20df_val%2C%0A%20%20%20%20dict_vectorizer%2C%0A%20%20%20%20get_roc_auc_score%2C%0A%20%20%20%20pd%2C%0A%20%20%20%20train_random_forest%2C%0A)%3A%0A%20%20%20%20def%20choose_n_estimators()%3A%0A%20%20%20%20%20%20%20%20scores%20%3D%20%5B%5D%0A%0A%20%20%20%20%20%20%20%20for%20estimators%20in%20%5B1%2C%205%2C%2010%2C%2050%2C%20100%2C%20200%2C%20500%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20random_forest%2C%20_%20%3D%20train_random_forest(df_train%2C%20n_estimators%3Destimators)%0A%20%20%20%20%20%20%20%20%20%20%20%20score%20%3D%20get_roc_auc_score(df_val%2C%20dict_vectorizer%2C%20random_forest)%0A%20%20%20%20%20%20%20%20%20%20%20%20scores.append((estimators%2C%20score))%0A%0A%20%20%20%20%20%20%20%20columns%20%3D%20%5B%22estimators%22%2C%20%22roc_auc%22%5D%0A%20%20%20%20%20%20%20%20df_scores%20%3D%20pd.DataFrame(scores%2C%20columns%3Dcolumns)%0A%0A%20%20%20%20%20%20%20%20return%20df_scores%0A%0A%20%20%20%20n_estimators_scores%20%3D%20choose_n_estimators()%0A%20%20%20%20n_estimators_scores.sort_values(%22roc_auc%22%2C%20ascending%3DFalse).head()%0A%20%20%20%20return%20(n_estimators_scores%2C)%0A%0A%0A%40app.cell%0Adef%20_(n_estimators_scores%2C%20sns)%3A%0A%20%20%20%20sns.lineplot(x%3Dn_estimators_scores.estimators%2C%20y%3Dn_estimators_scores.roc_auc)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%23%20Tuning%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20df_train%2C%0A%20%20%20%20df_val%2C%0A%20%20%20%20dict_vectorizer%2C%0A%20%20%20%20get_roc_auc_score%2C%0A%20%20%20%20pd%2C%0A%20%20%20%20train_random_forest%2C%0A)%3A%0A%20%20%20%20def%20search_best_random_forest()%3A%0A%20%20%20%20%20%20%20%20scores%20%3D%20%5B%5D%0A%0A%20%20%20%20%20%20%20%20for%20depth%20in%20%5BNone%2C%201%2C%202%2C%204%2C%208%2C%2016%2C%2032%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20for%20min_samples%20in%20%5B1%2C%205%2C%2010%2C%2050%2C%20100%2C%20250%2C%20500%5D%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20random_forest%2C%20_%20%3D%20train_random_forest(df_train%2C%20max_depth%3Ddepth%2C%20n_estimators%3D50)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20score%20%3D%20get_roc_auc_score(df_val%2C%20dict_vectorizer%2C%20random_forest)%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20scores.append((depth%2C%20min_samples%2C%20score))%0A%0A%20%20%20%20%20%20%20%20columns%20%3D%20%5B%22max_depth%22%2C%20%22min_samples_leaf%22%2C%20%22roc_auc%22%5D%0A%20%20%20%20%20%20%20%20df_scores%20%3D%20pd.DataFrame(scores%2C%20columns%3Dcolumns)%0A%0A%20%20%20%20%20%20%20%20return%20df_scores%0A%0A%20%20%20%20random_forest_scores%20%3D%20search_best_random_forest()%0A%20%20%20%20random_forest_scores.sort_values(%22roc_auc%22%2C%20ascending%3DFalse).head()%0A%20%20%20%20return%20(random_forest_scores%2C)%0A%0A%0A%40app.cell%0Adef%20_(random_forest_scores%2C%20sns)%3A%0A%20%20%20%20sns.heatmap(random_forest_scores.pivot(index%3D%22min_samples_leaf%22%2C%20columns%3D%5B%22max_depth%22%5D)%2C%20annot%3DTrue%2C%20fmt%3D'.2f')%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20df_train%2C%0A%20%20%20%20df_val%2C%0A%20%20%20%20dict_vectorizer%2C%0A%20%20%20%20get_roc_auc_score%2C%0A%20%20%20%20train_random_forest%2C%0A)%3A%0A%20%20%20%20random_forest%2C%20_%20%3D%20train_random_forest(df_train%2C%20min_samples_leaf%3D100%2C%20max_depth%3D8%2C%20n_estimators%3D50)%0A%0A%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%22roc_auc_val%22%3A%20get_roc_auc_score(df_val%2C%20dict_vectorizer%2C%20random_forest)%2C%0A%20%20%20%20%20%20%20%20%22roc_auc_train%22%3A%20get_roc_auc_score(df_train%2C%20dict_vectorizer%2C%20random_forest)%0A%20%20%20%20%7D%0A%20%20%20%20return%20(random_forest%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20r%22%22%22%0A%20%20%20%20%23%23%20Gradient%20boosting%20and%20XGBoost%0A%0A%20%20%20%20XGBoost%20(Extreme%20Gradient%20Boosting)%20is%20an%20advanced%20machine%20learning%20algorithm%20that%20builds%20a%20powerful%20model%20by%20combining%20many%20weak%20decision%20trees%20in%20sequence%2C%20where%20each%20new%20tree%20focuses%20on%20correcting%20the%20mistakes%20made%20by%20the%20previous%20ones.%20Instead%20of%20training%20all%20trees%20independently%20(as%20in%20random%20forests)%2C%20XGBoost%20adds%20them%20one%20at%20a%20time%2C%20optimizing%20the%20overall%20model%20through%20a%20process%20called%20gradient%20boosting%2C%20which%20minimizes%20errors%20using%20ideas%20from%20calculus.%0A%0A%20%20%20%20It%20also%20includes%20clever%20techniques%20like%20regularization%20to%20prevent%20overfitting%2C%20handling%20of%20missing%20data%2C%20and%20efficient%20use%20of%20memory%20and%20computation.%20The%20result%20is%20a%20fast%2C%20scalable%2C%20and%20highly%20accurate%20model%20widely%20used%20in%20data%20science%20competitions%20and%20real-world%20applications.%0A%20%20%20%20%22%22%22%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(DictVectorizer%2C%20df_train%2C%20df_val%2C%20get_features_and_target%2C%20pd)%3A%0A%20%20%20%20import%20xgboost%20as%20xgb%0A%20%20%20%20from%20xgboost.core%20import%20Booster%0A%0A%0A%20%20%20%20def%20train_booster(%0A%20%20%20%20%20%20%20%20df_train%3A%20pd.DataFrame%2C%20df_val%3A%20pd.DataFrame%2C%20xgb_params%3A%20dict%20%3D%20%7B%7D%0A%20%20%20%20)%20-%3E%20(Booster%2C%20DictVectorizer)%3A%0A%20%20%20%20%20%20%20%20X_train%2C%20y_train%2C%20dict_vectorizer%20%3D%20get_features_and_target(df_train)%0A%20%20%20%20%20%20%20%20dmatrix_train%20%3D%20xgb.DMatrix(%0A%20%20%20%20%20%20%20%20%20%20%20%20X_train%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20label%3Dy_train%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20feature_names%3Ddict_vectorizer.get_feature_names_out().tolist()%2C%0A%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20%20%20%20%20X_val%2C%20y_val%2C%20_%20%3D%20get_features_and_target(df_val%2C%20dict_vectorizer%3Ddict_vectorizer)%0A%20%20%20%20%20%20%20%20dmatrix_val%20%3D%20xgb.DMatrix(%0A%20%20%20%20%20%20%20%20%20%20%20%20X_val%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20label%3Dy_val%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20feature_names%3Ddict_vectorizer.get_feature_names_out().tolist()%2C%0A%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20%20%20%20%20watchlist%20%3D%20%5B(dmatrix_train%2C%20%22train%22)%2C%20(dmatrix_val%2C%20%22val%22)%5D%0A%20%20%20%20%20%20%20%20evals_result%20%3D%20%7B%7D%0A%0A%20%20%20%20%20%20%20%20booster%20%3D%20xgb.train(%0A%20%20%20%20%20%20%20%20%20%20%20%20xgb_params%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20dmatrix_train%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20num_boost_round%3D100%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20evals%3Dwatchlist%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20evals_result%3Devals_result%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20verbose_eval%3DFalse%0A%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20%20%20%20%20return%20booster%2C%20dict_vectorizer%2C%20evals_result%0A%0A%0A%20%20%20%20xgb_params%20%3D%20%7B%0A%20%20%20%20%20%20%20%20%22eta%22%3A%200.3%2C%0A%20%20%20%20%20%20%20%20%22max_depth%22%3A%206%2C%0A%20%20%20%20%20%20%20%20%22min_child_weight%22%3A%201%2C%0A%20%20%20%20%20%20%20%20%22objective%22%3A%20%22binary%3Alogistic%22%2C%0A%20%20%20%20%20%20%20%20%22nthread%22%3A%208%2C%0A%20%20%20%20%20%20%20%20%22seed%22%3A%201%2C%0A%20%20%20%20%20%20%20%20%22verbosity%22%3A%200%2C%0A%20%20%20%20%20%20%20%20%22eval_metric%22%3A%20%22auc%22%0A%20%20%20%20%7D%0A%0A%20%20%20%20non_tuned_booster%2C%20_%2C%20evals_result%20%3D%20train_booster(df_train%2C%20df_val%2C%20xgb_params)%0A%20%20%20%20return%20(%0A%20%20%20%20%20%20%20%20Booster%2C%0A%20%20%20%20%20%20%20%20evals_result%2C%0A%20%20%20%20%20%20%20%20non_tuned_booster%2C%0A%20%20%20%20%20%20%20%20train_booster%2C%0A%20%20%20%20%20%20%20%20xgb%2C%0A%20%20%20%20%20%20%20%20xgb_params%2C%0A%20%20%20%20)%0A%0A%0A%40app.cell%0Adef%20_(evals_result%2C%20sns)%3A%0A%20%20%20%20sns.lineplot(evals_result%5B%22train%22%5D%5B%22auc%22%5D%2C%20legend%3D%22brief%22%2C%20label%3D%22Train%22)%0A%20%20%20%20sns.lineplot(evals_result%5B%22val%22%5D%5B%22auc%22%5D%2C%20legend%3D%22brief%22%2C%20label%3D%22Validation%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%23%20Predict%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20Booster%2C%0A%20%20%20%20DictVectorizer%2C%0A%20%20%20%20df_val%2C%0A%20%20%20%20dict_vectorizer%2C%0A%20%20%20%20get_features_and_target%2C%0A%20%20%20%20non_tuned_booster%2C%0A%20%20%20%20pd%2C%0A%20%20%20%20xgb%2C%0A)%3A%0A%20%20%20%20def%20booster_predict(df%3A%20pd.DataFrame%2C%20booster%3A%20Booster%2C%20dict_vectorizer%3A%20DictVectorizer)%3A%0A%20%20%20%20%20%20%20%20X%2C%20y%2C%20_%20%3D%20get_features_and_target(df%2C%20dict_vectorizer%3Ddict_vectorizer)%0A%20%20%20%20%20%20%20%20dmatrix%20%3D%20xgb.DMatrix(X%2C%20label%3Dy%2C%20feature_names%3Ddict_vectorizer.get_feature_names_out().tolist())%0A%0A%20%20%20%20%20%20%20%20return%20booster.predict(dmatrix)%0A%0A%20%20%20%20booster_predict(df_val%5B10%3A20%5D%2C%20non_tuned_booster%2C%20dict_vectorizer)%0A%20%20%20%20return%20(booster_predict%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%23%20Evaluate%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20Booster%2C%0A%20%20%20%20booster_predict%2C%0A%20%20%20%20df_val%2C%0A%20%20%20%20dict_vectorizer%2C%0A%20%20%20%20get_features_and_target%2C%0A%20%20%20%20non_tuned_booster%2C%0A%20%20%20%20pd%2C%0A%20%20%20%20roc_auc_score%2C%0A)%3A%0A%20%20%20%20def%20booster_evaluate(df%3A%20pd.DataFrame%2C%20booster%3A%20Booster)%3A%0A%20%20%20%20%20%20%20%20X%2C%20y%2C%20_%20%3D%20get_features_and_target(df%2C%20dict_vectorizer%3Ddict_vectorizer)%0A%20%20%20%20%20%20%20%20y_pred%20%3D%20booster_predict(df%2C%20booster%2C%20dict_vectorizer)%0A%0A%20%20%20%20%20%20%20%20return%20roc_auc_score(y%2C%20y_pred)%0A%0A%20%20%20%20booster_evaluate(df_val%2C%20non_tuned_booster)%0A%20%20%20%20return%20(booster_evaluate%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20r%22%22%22%0A%20%20%20%20%23%23%20XGBoost%20Parameter%20Tuning%0A%0A%20%20%20%20In%20this%20section%20we'll%20be%20tunning%20these%203%20parameters%3A%0A%0A%20%20%20%20*%20%60eta%60%0A%20%20%20%20*%20%60max_depth%60%0A%20%20%20%20*%20%60min_child_weight%60%0A%20%20%20%20%22%22%22%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(df_train%2C%20df_val%2C%20train_booster)%3A%0A%20%20%20%20def%20evaluate_parameters(eta%3A%20int%20%3D%200.3%2C%20max_depth%3A%20int%20%3D%206%2C%20min_child_weight%3A%20int%20%3D%201)%3A%0A%20%20%20%20%20%20%20%20xgb_params%20%3D%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%22eta%22%3A%20eta%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22max_depth%22%3A%20max_depth%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22min_child_weight%22%3A%20min_child_weight%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22objective%22%3A%20%22binary%3Alogistic%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22nthread%22%3A%208%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22seed%22%3A%201%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22verbosity%22%3A%201%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22eval_metric%22%3A%20%22auc%22%0A%20%20%20%20%20%20%20%20%7D%0A%0A%20%20%20%20%20%20%20%20_%2C%20_%2C%20evals_result%20%3D%20train_booster(df_train%2C%20df_val%2C%20xgb_params)%0A%0A%20%20%20%20%20%20%20%20return%20evals_result%0A%20%20%20%20return%20(evaluate_parameters%2C)%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%23%20Parameter%20%60eta%60%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(evaluate_parameters%2C%20plt%2C%20sns)%3A%0A%20%20%20%20def%20evaluate_eta()%3A%0A%20%20%20%20%20%20%20%20eta_values%20%3D%20%5B0.1%2C%200.5%2C%201.0%5D%0A%0A%20%20%20%20%20%20%20%20for%20eta_value%20in%20eta_values%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20eta_eval%20%3D%20evaluate_parameters(eta%3Deta_value)%0A%20%20%20%20%20%20%20%20%20%20%20%20sns.lineplot(eta_eval%5B%22val%22%5D%5B%22auc%22%5D%2C%20legend%3D%22brief%22%2C%20label%3D%22Eta%20%3D%20%25s%22%20%25%20eta_value)%0A%0A%20%20%20%20%20%20%20%20plt.show()%0A%0A%20%20%20%20evaluate_eta()%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%23%20Parameter%20%60max_depth%60%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(evaluate_parameters%2C%20plt%2C%20sns)%3A%0A%20%20%20%20def%20evaluate_max_depth()%3A%0A%20%20%20%20%20%20%20%20max_depth_values%20%3D%20%5B5%2C%2010%2C%2020%5D%0A%0A%20%20%20%20%20%20%20%20for%20max_depth_value%20in%20max_depth_values%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20max_depth_eval%20%3D%20evaluate_parameters(max_depth%3Dmax_depth_value)%0A%20%20%20%20%20%20%20%20%20%20%20%20sns.lineplot(max_depth_eval%5B%22val%22%5D%5B%22auc%22%5D%2C%20legend%3D%22brief%22%2C%20label%3D%22Maximum%20depth%20%3D%20%25s%22%20%25%20max_depth_value)%0A%0A%20%20%20%20%20%20%20%20plt.show()%0A%0A%20%20%20%20evaluate_max_depth()%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%23%20Parameter%20%60min_child_weight%60%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(evaluate_parameters%2C%20plt%2C%20sns)%3A%0A%20%20%20%20def%20evaluate_min_child_weight()%3A%0A%20%20%20%20%20%20%20%20min_child_weight_values%20%3D%20%5B1%2C%205%2C%2010%5D%0A%0A%20%20%20%20%20%20%20%20for%20min_child_weight_value%20in%20min_child_weight_values%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20min_child_weight_eval%20%3D%20evaluate_parameters(min_child_weight%3Dmin_child_weight_value)%0A%20%20%20%20%20%20%20%20%20%20%20%20sns.lineplot(min_child_weight_eval%5B%22val%22%5D%5B%22auc%22%5D%2C%20legend%3D%22brief%22%2C%20label%3D%22Minimun%20child%20weight%20%3D%20%25s%22%20%25%20min_child_weight_value)%0A%0A%20%20%20%20%20%20%20%20plt.show()%0A%0A%20%20%20%20evaluate_min_child_weight()%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(df_train%2C%20df_val%2C%20train_booster%2C%20xgb_params)%3A%0A%20%20%20%20final_xgb_params%20%3D%20%7B%0A%20%20%20%20%20%20%20%20%22eta%22%3A%201%2C%0A%20%20%20%20%20%20%20%20%22max_depth%22%3A%205%2C%0A%20%20%20%20%20%20%20%20%22min_child_weight%22%3A%2010%2C%0A%20%20%20%20%20%20%20%20%22objective%22%3A%20%22binary%3Alogistic%22%2C%0A%20%20%20%20%20%20%20%20%22nthread%22%3A%208%2C%0A%20%20%20%20%20%20%20%20%22seed%22%3A%201%2C%0A%20%20%20%20%20%20%20%20%22verbosity%22%3A%200%2C%0A%20%20%20%20%20%20%20%20%22eval_metric%22%3A%20%22auc%22%0A%20%20%20%20%7D%0A%0A%20%20%20%20tuned_booster%2C%20_%2C%20tuned_evals%20%3D%20train_booster(df_train%2C%20df_val%2C%20xgb_params)%0A%20%20%20%20return%20final_xgb_params%2C%20tuned_booster%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%20Final%20Model%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%23%20Compare%20the%20Current%20Models%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(%0A%20%20%20%20booster_evaluate%2C%0A%20%20%20%20decision_tree%2C%0A%20%20%20%20df_val%2C%0A%20%20%20%20dict_vectorizer%2C%0A%20%20%20%20get_roc_auc_score%2C%0A%20%20%20%20random_forest%2C%0A%20%20%20%20tuned_booster%2C%0A)%3A%0A%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%22decision_tree_auc%22%3A%20get_roc_auc_score(df_val%2C%20dict_vectorizer%2C%20decision_tree)%2C%0A%20%20%20%20%20%20%20%20%22random_forest_auc%22%3A%20get_roc_auc_score(df_val%2C%20dict_vectorizer%2C%20random_forest)%2C%0A%20%20%20%20%20%20%20%20%22xg_booster%22%3A%20booster_evaluate(df_val%2C%20tuned_booster)%2C%0A%20%20%20%20%7D%0A%20%20%20%20return%0A%0A%0A%40app.cell(hide_code%3DTrue)%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(r%22%22%22%23%23%23%20Train%20the%20Final%20Model%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20booster_evaluate%2C%0A%20%20%20%20df_full%2C%0A%20%20%20%20df_test%2C%0A%20%20%20%20df_val%2C%0A%20%20%20%20final_xgb_params%2C%0A%20%20%20%20train_booster%2C%0A)%3A%0A%20%20%20%20final_booster%2C%20_%2C%20final_evals%20%3D%20train_booster(df_full%2C%20df_val%2C%20final_xgb_params)%0A%20%20%20%20booster_evaluate(df_test%2C%20final_booster)%0A%20%20%20%20return%0A%0A%0Aif%20__name__%20%3D%3D%20%22__main__%22%3A%0A%20%20%20%20app.run()%0A