homework

import marimo

__generated_with = "0.16.0"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    return mo, np, pd, plt


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    # Module 4: [Evaluation](https://github.com/DataTalksClub/machine-learning-zoomcamp/tree/master/04-evaluation)

    ## Homework

    ### Dataset

    For this homework, we'll use the lead scoring Bank Marketing dataset. Download it from here. You can do it with wget:

    wget [https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv](https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv)

    In this dataset our desired target for classification task will be **converted** variable - has the client signed up to the platform or not.
    """
    )
    return


@app.cell
def _(pd):
    def read_dataframe():
        """Load the lead-scoring CSV from its path relative to the working directory."""
        return pd.read_csv("./module-4/data/course_lead_scoring.csv")

    raw_dataframe = read_dataframe()
    raw_dataframe.head()
    return (raw_dataframe,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## Data preparation

    ### Check if the missing values are presented in the features
    """
    )
    return


@app.cell
def _(raw_dataframe):
    raw_dataframe.isnull().sum()
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ### If there are missing values:

    #### For categorical features, replace them with 'NA'
    """
    )
    return


@app.cell
def _(raw_dataframe):
    categorical_columns = ["lead_source", "industry", "employment_status", "location"]

    def fill_categorical_values(dataframe):
        """Return a copy of ``dataframe`` whose missing categorical values are 'NA'.

        The input frame is left untouched: ``raw_dataframe`` is defined by
        another cell, and mutating it here would make the missing-value
        check above depend on which cell happened to run first.
        """
        filled = dataframe.copy()
        filled[categorical_columns] = filled[categorical_columns].fillna('NA')

        return filled

    fill_categorical_values(raw_dataframe).head()
    return categorical_columns, fill_categorical_values


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""#### For numerical features, replace with with 0.0""")
    return


@app.cell
def _(raw_dataframe):
    numeric_columns = ["number_of_courses_viewed", "annual_income", "interaction_count", "lead_score"]

    def fill_numerical_values(dataframe):
        """Return a copy of ``dataframe`` whose missing numeric values are 0.0.

        Works on a copy for the same reason as the categorical variant: the
        frame passed in belongs to another cell and must not be mutated.
        """
        filled = dataframe.copy()
        filled[numeric_columns] = filled[numeric_columns].fillna(0.0)

        return filled

    fill_numerical_values(raw_dataframe).head()
    return fill_numerical_values, numeric_columns


@app.cell
def _(fill_categorical_values, fill_numerical_values, raw_dataframe):
    # Both helpers copy their input, so no defensive copy is needed here.
    filled_dataframe = fill_numerical_values(fill_categorical_values(raw_dataframe))
    filled_dataframe.head()
    return (filled_dataframe,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    #### Split the data into 3 parts

    Split the data into 3 parts: train/validation/test with 60% / 20% / 20% distribution. Use train_test_split function for that with random_state=1.
    """
    )
    return


@app.cell
def _(filled_dataframe):
    from sklearn.model_selection import train_test_split

    # 20% is held out for test first; 25% of the remaining 80% is the
    # validation part, which yields the requested 60/20/20 split.
    full_dataframe, test_dataframe = train_test_split(filled_dataframe, test_size=0.2, random_state=1)
    train_dataframe, val_dataframe = train_test_split(full_dataframe, test_size=0.25, random_state=1)

    {
        "len(train_dataframe)": len(train_dataframe),
        "len(val_dataframe)": len(val_dataframe),
        "len(test_dataframe)": len(test_dataframe),
    }
    return full_dataframe, train_dataframe, val_dataframe


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## Question 1: ROC AUC feature importance

    ROC AUC could also be used to evaluate feature importance of numerical variables. 

    Let's do that

    * For each numerical variable, use it as score (aka prediction) and compute the AUC with the `y` variable as ground truth.
    * Use the training dataset for that


    If your AUC is < 0.5, invert this variable by putting "-" in front

    (e.g. `-df_train['balance']`)

    AUC can go below 0.5 if the variable is negatively correlated with the target variable. You can change the direction of the correlation by negating this variable - then negative correlation becomes positive.
    """
    )
    return


@app.cell
def _(numeric_columns, pd, train_dataframe):
    from sklearn.metrics import roc_auc_score

    def compute_auc_for_numeric_features(dataframe: pd.DataFrame) -> dict:
        """Score every numeric feature by using it directly as a prediction.

        Returns a mapping ``feature name -> ROC AUC`` against the
        ``converted`` column. A feature whose AUC is below 0.5 is negated
        and re-scored, as the task statement asks, so that negatively
        correlated features are comparable with positively correlated ones.
        """
        auc_scores = {}

        for feature in numeric_columns:
            auc = roc_auc_score(dataframe.converted, dataframe[feature])
            if auc < 0.5:
                auc = roc_auc_score(dataframe.converted, -dataframe[feature])
            auc_scores[feature] = auc

        return auc_scores

    compute_auc_for_numeric_features(train_dataframe)
    return (roc_auc_score,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    Which numerical variable (among the following 4) has the highest AUC?

    - `lead_score`
    - `number_of_courses_viewed`
    - `interaction_count`
    - `annual_income`
    """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""The `number_of_courses_viewed` feature has the highest AUC with a value of $0.7564$.""")
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ### Question 2: Training the model

    Apply one-hot-encoding using `DictVectorizer` and train the logistic regression with these parameters:

    ```python
    LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    ```
    """
    )
    return


@app.cell
def _(
    categorical_columns,
    numeric_columns,
    pd,
    roc_auc_score,
    train_dataframe,
    val_dataframe,
):
    from sklearn.linear_model import LogisticRegression
    from sklearn.feature_extraction import DictVectorizer

    def separate_target(dataframe: pd.DataFrame):
        """Split ``dataframe`` into the model features and the ``converted`` target."""
        y = dataframe.converted
        X = dataframe.drop(columns=['converted'])
        return X[numeric_columns + categorical_columns], y

    def train(dataframe: pd.DataFrame):
        """Fit a DictVectorizer and a logistic regression on ``dataframe``.

        Returns the fitted ``(dict_vectorizer, model)`` pair.
        """
        X, y = separate_target(dataframe)

        dict_vectorizer = DictVectorizer(sparse=False)
        X_dict = dict_vectorizer.fit_transform(X.to_dict(orient="records"))

        model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=1)
        model.fit(X_dict, y)

        return dict_vectorizer, model

    def evaluate(
        model: LogisticRegression,
        dict_vectorizer: DictVectorizer,
        dataframe: pd.DataFrame
    ) -> float:
        """Return the ROC AUC of ``model`` on ``dataframe``, rounded to 3 digits."""
        X, y = separate_target(dataframe)
        X_dict = dict_vectorizer.transform(X.to_dict(orient="records"))

        # ROC AUC ranks the continuous scores; feeding it predictions that
        # were already cut at a threshold collapses the curve to one point.
        # NOTE(review): the written answer in the cell after the question was
        # produced with thresholded predictions - re-run and confirm it.
        y_pred = model.predict_proba(X_dict)[:, 1]

        return round(roc_auc_score(y, y_pred), 3)

    dict_vectorizer, model = train(train_dataframe)
    evaluate(model, dict_vectorizer, val_dataframe)
    return (
        DictVectorizer,
        LogisticRegression,
        dict_vectorizer,
        model,
        separate_target,
    )


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    What's the AUC of this model on the validation dataset? (round to 3 digits)

    - 0.32
    - 0.52
    - 0.72
    - 0.92
    """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""The closest suggested answer to the AUC of this model is 0.72.""")
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ### Question 3: Precision and Recall

    Now let's compute precision and recall for our model.

    * Evaluate the model on all thresholds from 0.0 to 1.0 with step 0.01
    * For each threshold, compute precision and recall
    * Plot them
    """
    )
    return


@app.cell
def _(
    DictVectorizer,
    LogisticRegression,
    dict_vectorizer,
    model,
    np,
    pd,
    plt,
    separate_target,
    val_dataframe,
):
    from sklearn.metrics import recall_score, precision_score

    def positive_class_scores(
        model: LogisticRegression,
        dict_vectorizer: DictVectorizer,
        dataframe: pd.DataFrame,
    ) -> tuple:
        """Return ``(y, scores)``: the target and the predicted probability of class 1."""
        X, y = separate_target(dataframe)
        X_dict = dict_vectorizer.transform(X.to_dict(orient="records"))

        return y, model.predict_proba(X_dict)[:, 1]

    def precision_and_recall_for_threshold(
        model: LogisticRegression,
        dict_vectorizer: DictVectorizer,
        dataframe: pd.DataFrame,
        threshold: float
    ) -> tuple:
        """Return ``(precision, recall)`` when predicting positive at ``score >= threshold``.

        Precision is reported as 1.0 when nothing is predicted positive
        (``zero_division=1.0``).
        """
        y, scores = positive_class_scores(model, dict_vectorizer, dataframe)
        y_pred = scores >= threshold

        return precision_score(y, y_pred, zero_division=1.0), recall_score(y, y_pred)

    def evaluate_precision_and_recall(
        model: LogisticRegression,
        dict_vectorizer: DictVectorizer,
        dataframe: pd.DataFrame,
    ) -> pd.DataFrame:
        """Tabulate precision and recall for thresholds 0.00, 0.01, ..., 1.00.

        Returns a DataFrame with the columns ``threshold``, ``precision``
        and ``recall``, one row per threshold.
        """
        # The scores do not depend on the threshold: predict once, cut many times.
        y, scores = positive_class_scores(model, dict_vectorizer, dataframe)

        evaluation = []
        # 101 points are needed for a step of exactly 0.01 over [0, 1];
        # 100 points would give a step of 1/99.
        for threshold in np.linspace(0, 1, 101):
            y_pred = scores >= threshold
            precision = precision_score(y, y_pred, zero_division=1.0)
            recall = recall_score(y, y_pred)
            evaluation.append((threshold, precision, recall,))

        return pd.DataFrame(evaluation, columns=["threshold", "precision", "recall"])

    def plot_precision_and_recall(
        model: LogisticRegression,
        dict_vectorizer: DictVectorizer,
        dataframe: pd.DataFrame,
    ):
        """Plot the precision and recall curves against the decision threshold."""
        evaluation = evaluate_precision_and_recall(model, dict_vectorizer, dataframe)

        plt.plot(evaluation.threshold, evaluation.precision, label="Precision", color="g")
        plt.plot(evaluation.threshold, evaluation.recall, label="Recall", color="r")
        # Hard-coded vertical marker kept from the original notebook.
        # NOTE(review): presumably marks the observed intersection - confirm after re-running.
        plt.vlines(0.64, 0, 1)
        plt.xlabel("Threshold")
        # The curves carry labels, so show them.
        plt.legend()
        plt.show()

    plot_precision_and_recall(model, dict_vectorizer, val_dataframe)
    return (evaluate_precision_and_recall,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    At which threshold precision and recall curves intersect?

    * 0.145
    * 0.345
    * 0.545
    * 0.745
    """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""The closest suggested is 0.545.""")
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ### Question 4: F1 score

    Precision and recall are conflicting - when one grows, the other goes down. That's why they are often combined into the F1 score - a metrics that takes into account both

    This is the formula for computing F1:

    $$F_1 = 2 \cdot \cfrac{P \cdot R}{P + R}$$

    Where $P$ is precision and $R$ is recall.

    Let's compute F1 for all thresholds from 0.0 to 1.0 with increment 0.01
    """
    )
    return


@app.cell
def _(
    DictVectorizer,
    LogisticRegression,
    dict_vectorizer,
    evaluate_precision_and_recall,
    model,
    pd,
    plt,
    val_dataframe,
):
    def evaluate_with_f1(
        model: LogisticRegression,
        dict_vectorizer: DictVectorizer,
        dataframe: pd.DataFrame,
    ):
        """Extend the precision/recall table with an ``f1`` column."""
        evaluation = evaluate_precision_and_recall(model, dict_vectorizer, dataframe)

        precision_plus_recall = evaluation.precision + evaluation.recall
        f1 = 2 * evaluation.precision * evaluation.recall / precision_plus_recall
        # Precision and recall both being 0 gives 0/0 = NaN; F1 is 0 there.
        evaluation["f1"] = f1.fillna(0.0)

        return evaluation

    def plot_f1(evaluation: pd.DataFrame):
        """Plot F1 against the decision threshold."""
        plt.plot(evaluation.threshold, evaluation.f1, label="F1", color="b")
        plt.xlabel("Threshold")
        plt.show()

    def find_max(evaluation: pd.DataFrame):
        """Print the best F1 and the first threshold at which it is reached."""
        # idxmax returns the first row holding the maximum.
        f1_max = evaluation.loc[evaluation.f1.idxmax()]
        print("F1 max is {:.3f} and it's reach at {:.3f}".format(f1_max.f1, f1_max.threshold))

    f1_evaluation = evaluate_with_f1(model, dict_vectorizer, val_dataframe)
    plot_f1(f1_evaluation)
    find_max(f1_evaluation)
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    At which threshold F1 is maximal?

    - 0.14
    - 0.34
    - 0.54
    - 0.74
    """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""The closest suggested option to the threshold where F1 is maximal is 0.54.""")
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## Question 5: 5-Fold CV


    Use the `KFold` class from Scikit-Learn to evaluate our model on 5 different folds:

    ```
    KFold(n_splits=5, shuffle=True, random_state=1)
    ```

    * Iterate over different folds of `df_full_train`
    * Split the data into train and validation
    * Train the model on train with these parameters: `LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)`
    * Use AUC to evaluate the model on validation
    """
    )
    return


@app.cell
def _(
    DictVectorizer,
    LogisticRegression,
    categorical_columns,
    full_dataframe,
    np,
    numeric_columns,
    pd,
    roc_auc_score,
):
    from sklearn.model_selection import KFold

    def get_trained_vectorizer(dataframe: pd.DataFrame) -> tuple[DictVectorizer, list[dict]]:
        """Fit a DictVectorizer on the feature columns of ``dataframe``.

        Returns the fitted vectorizer together with the list of record
        dicts it was fitted on, so the caller can transform them without
        rebuilding the records.
        """
        # Select the same columns that are used for the validation records,
        # so that training and validation always see one feature set.
        features = dataframe[numeric_columns + categorical_columns]
        dictionary = features.to_dict(orient="records")

        dict_vectorizer = DictVectorizer(sparse=False)
        dict_vectorizer.fit(dictionary)

        return dict_vectorizer, dictionary

    def get_features_and_target(dataframe: pd.DataFrame, dict_vectorizer: DictVectorizer, dictionary):
        """Return ``(X, y)``: the vectorized ``dictionary`` and a boolean ``converted == 1`` target."""
        X = dict_vectorizer.transform(dictionary)
        y = dataframe.converted == 1

        return X, y

    def train_folds(df_full: pd.DataFrame):
        """Run 5-fold cross-validation with C=1.0 and return the per-fold AUC scores.

        Also prints the mean and the standard deviation of the scores.
        """
        kfolds = KFold(n_splits=5, shuffle=True, random_state=1)

        auc_scores = []
        for train_idx, val_idx in kfolds.split(df_full):
            df_train = df_full.iloc[train_idx]
            df_val = df_full.iloc[val_idx]

            # The vectorizer is fitted on the training fold only.
            dict_vectorizer, dictionary = get_trained_vectorizer(df_train)
            X_train, y_train = get_features_and_target(df_train, dict_vectorizer, dictionary)

            model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
            model.fit(X_train, y_train)

            dictionary_val = df_val[numeric_columns + categorical_columns].to_dict(orient="records")
            X_val, y_val = get_features_and_target(df_val, dict_vectorizer, dictionary_val)
            y_pred = model.predict_proba(X_val)[:, 1]

            auc_scores.append(roc_auc_score(y_val, y_pred))

        print("{:.3f} +- {:.3f}".format(np.mean(auc_scores), np.std(auc_scores)))

        return auc_scores

    train_folds(full_dataframe)
    return KFold, get_features_and_target, get_trained_vectorizer


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    How large is standard deviation of the scores across different folds?

    - 0.0001
    - 0.006
    - 0.06
    - 0.36
    """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""The closest value to the standard deviation across folds is $0.06$.""")
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ### Question 6: Hyperparameter Tuning

    Now let's use 5-Fold cross-validation to find the best parameter `C`

    * Iterate over the following `C` values: `[0.000001, 0.001, 1]`
    * Initialize `KFold` with the same parameters as previously
    * Use these parameters for the model: `LogisticRegression(solver='liblinear', C=C, max_iter=1000)`
    * Compute the mean score as well as the std (round the mean and std to 3 decimal digits)
    """
    )
    return


@app.cell
def _(
    KFold,
    LogisticRegression,
    categorical_columns,
    full_dataframe,
    get_features_and_target,
    get_trained_vectorizer,
    np,
    numeric_columns,
    pd,
    roc_auc_score,
):
    def hypertune_folds(df_full: pd.DataFrame, C: float):
        """Run 5-fold cross-validation for one value of ``C``.

        Prints ``C`` with the mean and standard deviation of the fold AUCs
        and returns the list of per-fold AUC scores.
        """
        kfolds = KFold(n_splits=5, shuffle=True, random_state=1)

        auc_scores = []
        for train_idx, val_idx in kfolds.split(df_full):
            df_train = df_full.iloc[train_idx]
            df_val = df_full.iloc[val_idx]

            dict_vectorizer, dictionary = get_trained_vectorizer(df_train)
            X_train, y_train = get_features_and_target(df_train, dict_vectorizer, dictionary)

            model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
            model.fit(X_train, y_train)

            dictionary_val = df_val[numeric_columns + categorical_columns].to_dict(orient="records")
            X_val, y_val = get_features_and_target(df_val, dict_vectorizer, dictionary_val)
            y_pred = model.predict_proba(X_val)[:, 1]

            auc_scores.append(roc_auc_score(y_val, y_pred))

        # C is printed unrounded: with three decimals 0.000001 would show as 0.000.
        print("C={}: {:.3f} +- {:.3f}".format(C, np.mean(auc_scores), np.std(auc_scores)))

        return auc_scores

    def test_c_values(df_full: pd.DataFrame, C_values: list[float]):
        """Cross-validate every value in ``C_values`` on ``df_full``.

        Returns one list of per-fold AUC scores per value, in input order.
        """
        # Use the frame that was passed in, not the cell-level one.
        return [hypertune_folds(df_full, C_value) for C_value in C_values]

    test_c_values(full_dataframe, [0.000001, 0.001, 1])
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""The best score corresponds to $C = 0.001$.""")
    return


if __name__ == "__main__":
    app.run()