| |
|
|
| import pandas as pd |
| import numpy as np |
| import streamlit as st |
| import pandas as pd |
| import numpy as np |
| import seaborn as sn |
| import matplotlib.pyplot as plt |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.preprocessing import MinMaxScaler, StandardScaler |
| from sklearn.metrics import confusion_matrix, classification_report |
| from sklearn.model_selection import train_test_split |
| import xgboost as xgb |
| from sklearn.linear_model import LinearRegression |
| from sklearn.metrics import mean_squared_error, r2_score |
| from sklearn.decomposition import PCA |
| from sklearn.preprocessing import StandardScaler |
| import numpy as np |
| import plotly.figure_factory as ff |
|
|
|
|
# Request Streamlit's "wide" page layout for this page.
st.set_page_config(layout="wide")
|
|
def point_estimates(df, model_type, flag, identifier, control_sample_size, solver=None, max_iter=None, class_weights=None, max_depth=None, subsample=None, eta=None):
    """Fit a propensity model on PCA-reduced features and score every row of ``df``.

    Every column except ``identifier`` and ``flag`` is standardised and
    projected onto the leading principal components whose cumulative
    explained-variance ratio first exceeds 0.995. The classifier is trained on
    all ``flag == 1`` rows plus a seeded random sample of ``flag == 0`` rows.
    Its training-set classification report and confusion matrix are rendered
    with Streamlit, and the second ``predict_proba`` column (class 1 for a 0/1
    flag) is then computed for all rows.

    Args:
        df: Frame holding ``identifier``, ``flag`` and the feature columns.
        model_type: ``'linear'`` (``LogisticRegression``) or ``'xgboost'``
            (``xgb.XGBClassifier``).
        flag: Name of the column compared against 1 and 0 to split the rows.
        identifier: Name of the identifier column; excluded from the features.
        control_sample_size: Number of ``flag == 0`` rows sampled for training.
            Capped at the number of such rows available.
        solver: ``LogisticRegression`` solver; library default when ``None``.
        max_iter: ``LogisticRegression`` iteration cap; library default when
            ``None``.
        class_weights: Passed as ``class_weight`` to ``LogisticRegression``.
        max_depth: Passed to ``XGBClassifier``.
        subsample: Passed to ``XGBClassifier``.
        eta: Passed to ``XGBClassifier``.

    Returns:
        A one-column DataFrame named ``'propensity_score'`` with one row per
        row of ``df``, carrying ``df``'s index.

    Raises:
        ValueError: If ``model_type`` is neither ``'linear'`` nor ``'xgboost'``.
    """
    if model_type not in ('linear', 'xgboost'):
        raise ValueError(
            "Unsupported model_type {!r}; expected 'linear' or 'xgboost'.".format(model_type)
        )

    df_pca = _pca_reduce(df, flag, identifier)
    feature_cols = [col for col in df_pca.columns if col != flag]

    # Training sample: every flagged row plus a reproducible control sample.
    treated = df_pca[df_pca[flag] == 1]
    control = df_pca[df_pca[flag] == 0]
    # Asking for more controls than exist would make DataFrame.sample raise.
    n_control = min(control_sample_size, len(control))
    control_sample = control.sample(n=n_control, random_state=42)
    training = pd.concat([treated, control_sample], ignore_index=True)
    X_train = training[feature_cols]
    # 1-D target: estimators expect a vector, not an (n, 1) frame.
    y_train = training[flag].to_numpy()

    if model_type == 'linear':
        lr_settings = {'solver': solver, 'max_iter': max_iter, 'class_weight': class_weights}
        # Drop unset options so the estimator's own defaults apply.
        model = LogisticRegression(
            **{name: value for name, value in lr_settings.items() if value is not None}
        )
    else:
        model = xgb.XGBClassifier(max_depth=max_depth, subsample=subsample, eta=eta)
    model.fit(X_train, y_train)

    # Diagnostics are computed on the training sample itself.
    train_pred = model.predict(X_train)
    fig = _confusion_matrix_figure(y_train, train_pred)
    report_df = pd.DataFrame(
        classification_report(y_train, train_pred, output_dict=True)
    ).transpose()

    # Score every row, not just the training sample. Re-using df's index keeps
    # the scores aligned with their source rows when the caller assigns them.
    probabilities = model.predict_proba(df_pca[feature_cols])
    scores = pd.DataFrame({'propensity_score': probabilities[:, 1]}, index=df.index)

    st.subheader("Classification Report")
    st.dataframe(report_df, width=600)
    st.subheader("Confusion matrix")
    st.plotly_chart(fig)
    return scores


def _pca_reduce(df, flag, identifier, variance_threshold=0.995):
    """Return the leading principal components of the features plus the ``flag`` column."""
    features = df.drop(columns=[identifier, flag])
    scaled = StandardScaler().fit_transform(features)
    # PCA cannot extract more components than min(n_samples, n_features).
    pca = PCA(n_components=min(scaled.shape))
    pca.fit(scaled)
    components = pca.transform(scaled)

    cumulative = pca.explained_variance_ratio_.cumsum()
    above = np.flatnonzero(cumulative > variance_threshold)
    # Keep all components if the threshold is never crossed (e.g. NaN ratios).
    last = above[0] if above.size else len(cumulative) - 1

    reduced = pd.DataFrame(components[:, :last + 1])
    # Positional assignment: `reduced` has a fresh RangeIndex, so label-based
    # alignment against df's index could silently introduce NaN.
    reduced[flag] = df[flag].to_numpy()
    return reduced


def _confusion_matrix_figure(y_true, y_pred):
    """Build an annotated heatmap of the confusion matrix as a fraction of all samples."""
    classes = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=classes) / len(y_true)

    # Rows of `cm` are actual classes and columns are predicted classes, so the
    # hover label of cell (i, j) must name classes[i] and classes[j].
    hover_text = [
        ['Actual: {}<br>Predicted: {}<br>Value: {:.2f}'.format(actual, predicted, cm[i, j])
         for j, predicted in enumerate(classes)]
        for i, actual in enumerate(classes)
    ]

    fig = ff.create_annotated_heatmap(
        z=cm,
        x=list(classes),
        y=list(classes),
        colorscale='blues',
        hoverinfo='text',
        text=hover_text,
    )
    fig.update_layout(
        title='Confusion Matrix',
        xaxis_title='Predicted',
        yaxis_title='Actual',
        font=dict(size=14),
    )
    return fig
|
|
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
|
|
|
|
|
|
|
|
st.title("Algorithms")

# Seed session state on the first run only; later reruns keep the user's choices.
_STATE_DEFAULTS = {
    "classification_option": "Classification",
    "algorithm_option": "Logistic Regression",
}
for _state_key, _default_value in _STATE_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _default_value

# The radio is bound to st.session_state.classification_option through `key`,
# so its return value and the stored state are always the same value. No manual
# write-back is needed, and writing to a widget's key after the widget has been
# created is rejected by Streamlit.
classification_option = st.radio(
    "Algorithm Type",
    ["Classification", "Regression"],
    key="classification_option",
)
|
|
# Checkbox labels per algorithm family: (linear-model label, XGBoost label).
# Both families render the same controls; only these labels differ.
# NOTE(review): the Regression family exposes the same solver / class-weight /
# XGBoost settings as Classification - confirm whether it should configure
# regressors instead.
_ALGORITHM_LABELS = {
    "Classification": ("Logistic Regression", "Xgboost Classifier"),
    "Regression": ("Linear Regression", "Xgboost Regression"),
}
lr_label, xgb_label = _ALGORITHM_LABELS[classification_option]

# --- Linear model: selection checkbox and "change defaults" toggle ---
col1, col2 = st.columns(2)

with col1:
    st.write("#####")
    lr_checkbox = st.checkbox(
        label=lr_label,
        key="algorithm_lr_cb",
        value=(st.session_state.algorithm_option == lr_label),
    )

with col2:
    st.write("#####")
    show_lr_options = st.checkbox(
        label="Change default options",
        key="lr_options_cb",
        disabled=not lr_checkbox,
    )

cols = st.columns((2, 1))
with cols[0]:
    lr_hyp_placeholder = st.empty()
    lr_model_placeholder = st.empty()

# Defaults used unless the user opens the options panel.
solver = 'lbfgs'
class_weights = None
max_iter = 1000
# A disabled toggle can still hold a stale True, hence the extra lr_checkbox test.
if show_lr_options and lr_checkbox:
    with lr_hyp_placeholder:
        with st.expander("LR parameters"):
            solver = st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'])
            max_iter = st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
            class_weight_option = st.selectbox(
                'Select class weights option:',
                ('Custom', 'Balanced')
            )

            if class_weight_option == 'Custom':
                weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
                weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
                class_weights = {1: weight_1, 0: weight_0}
            elif class_weight_option == 'Balanced':
                # scikit-learn's 'balanced' mode weights classes inversely to
                # their frequencies; a fixed {1: 0.5, 0: 0.5} dict would only
                # weight both classes equally.
                class_weights = 'balanced'

# --- XGBoost: selection checkbox and "change defaults" toggle ---
col1, col2 = st.columns(2)

with col1:
    st.write("#####")
    xgb_checkbox = st.checkbox(
        label=xgb_label,
        key="algorithm_xgb_cb",
        value=(st.session_state.algorithm_option == xgb_label),
    )

with col2:
    st.write("#####")
    show_xgb_options = st.checkbox(
        label="Change default options",
        key="xgb_options_cb",
        disabled=not xgb_checkbox,
    )

cols = st.columns((2, 1))
with cols[0]:
    xgb_hyp_placeholder = st.empty()

# None leaves each hyper-parameter at the library default.
max_depth = None
subsample = None
eta = None

if show_xgb_options and xgb_checkbox:
    with xgb_hyp_placeholder:
        with st.expander("XGB hyper parameters"):
            max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
            subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
            eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)

# Remember the choice; the linear model wins when both boxes are ticked.
st.session_state.algorithm_option = lr_label if lr_checkbox else xgb_label
|
|
# Count control (flag == 0) and treated (flag == 1) rows once.
_flag_values = st.session_state.imputed_df[st.session_state.flag]
_n_control = int((_flag_values == 0).sum())
_n_treated = int((_flag_values == 1).sum())

with cols[0]:
    control_sample_size = st.slider(
        'Control Sample Size',
        min_value=1,
        max_value=_n_control,
        # Default to the treated count, clamped into the slider's range: a
        # default above max_value (more treated than control rows) or below
        # min_value is rejected by Streamlit.
        value=max(1, min(_n_treated, _n_control)),
    )
|
|
| |
| |
|
|
if st.button("Run Modeling"):
    # Arguments shared by both model types.
    _common_args = dict(
        flag=st.session_state.flag,
        identifier=st.session_state.identifier,
        control_sample_size=control_sample_size,
    )

    # The linear model takes precedence when both checkboxes are ticked.
    if lr_checkbox:
        _scores = point_estimates(
            st.session_state.binned_df,
            model_type='linear',
            solver=solver,
            max_iter=max_iter,
            class_weights=class_weights,
            **_common_args,
        )
    elif xgb_checkbox:
        _scores = point_estimates(
            st.session_state.binned_df,
            model_type='xgboost',
            max_depth=max_depth,
            subsample=subsample,
            eta=eta,
            **_common_args,
        )
    else:
        _scores = None
        st.warning("Select an algorithm before running the model.")

    if _scores is not None:
        # point_estimates returns one score per input row in input order, so
        # assign positionally; label alignment could introduce NaN when
        # binned_df does not carry a default RangeIndex.
        st.session_state.binned_df['propensity_score'] = _scores['propensity_score'].to_numpy()

    # Split on the configured flag column (the same column the model was
    # trained against) rather than a hard-coded column name.
    _flag_column = st.session_state.binned_df[st.session_state.flag]
    st.session_state.treated_df = st.session_state.binned_df[_flag_column == 1]
    st.session_state.non_treated_df = st.session_state.binned_df[_flag_column == 0]
|
|
|
|