| import streamlit as st |
| import pandas as pd |
| import numpy as np |
| from sklearn.neighbors import NearestNeighbors |
| from sklearn.preprocessing import StandardScaler |
| import xgboost as xgb |
| import base64 |
| import streamlit as st |
| import pandas as pd |
| import numpy as np |
| import matplotlib.pyplot as plt |
| from sklearn.preprocessing import StandardScaler |
| from sklearn.neighbors import NearestNeighbors |
| from math import sqrt |
| from statistics import mean, variance |
| import seaborn as sns |
|
|
| import plotly.graph_objects as go |
|
|
def cohend_plot_function(std_mean_diff_df2, std_mean_diff_df, selected_attributes):
    """Render a scatter plot of Cohen's d per metric for two comparisons.

    Args:
        std_mean_diff_df2: DataFrame with "Metrics" and "Cohend Value"
            columns; plotted as the 'general_control_cohend' trace.
        std_mean_diff_df: DataFrame with the same two columns; plotted as
            the 'synthetic_control_cohend' trace.
        selected_attributes: Metric names to keep; rows whose "Metrics"
            value is not in this collection are left out of the plot.

    The figure is drawn with ``st.plotly_chart``; nothing is returned.
    """
    general = std_mean_diff_df2[std_mean_diff_df2["Metrics"].isin(selected_attributes)]
    matched = std_mean_diff_df[std_mean_diff_df["Metrics"].isin(selected_attributes)]

    fig = go.Figure()

    # Each trace takes its labels from its own frame so x and y always
    # describe the same rows.
    fig.add_trace(go.Scatter(
        x=general["Cohend Value"][::-1],
        y=list(general["Metrics"][::-1]),
        mode='markers',
        marker=dict(color='blue'),
        name='general_control_cohend',
    ))
    fig.add_trace(go.Scatter(
        x=matched["Cohend Value"][::-1],
        y=list(matched["Metrics"][::-1]),
        mode='markers',
        marker=dict(color='orange', symbol='diamond-open'),
        name='synthetic_control_cohend',
    ))

    # Vertical guides: dashed gray thresholds first, then a solid black
    # zero line. yref="paper" makes every guide span the full plot height
    # regardless of how many metrics are shown.
    guides = [
        (val, dict(color="gray", width=1, dash="dash"))
        for val in [-0.1, 0.1, -0.75, -0.5, -0.25, 0.25, 0.5, 0.75]
    ]
    guides.append((0, dict(color="black", width=1)))
    for position, line_style in guides:
        fig.add_shape(
            type="line",
            xref="x",
            yref="paper",
            x0=position,
            y0=0,
            x1=position,
            y1=1,
            line=line_style,
        )

    fig.update_layout(
        xaxis=dict(
            title='cohend',
            range=[-1, 1],
        ),
        yaxis=dict(
            title='Metrics',
            autorange="reversed",
        ),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
        ),
    )

    st.plotly_chart(fig, use_container_width=True)
|
|
|
|
def plot_comparison(comparison_df):
    """Draw a grouped bar chart of treatment vs control values per index entry.

    Args:
        comparison_df: DataFrame whose first column holds the treatment
            values and whose second column holds the control values. The
            index supplies the x positions (labelled 'quartiles').

    The chart title is the first column's name with everything up to and
    including the first 'treatment' plus one separator character removed
    (e.g. 'treatment_age' -> 'age'). If the column name does not contain
    'treatment', the full column name is used as the title.

    The figure is drawn with ``st.plotly_chart``; nothing is returned.
    """
    treatment_column, control_column = comparison_df.columns[0], comparison_df.columns[1]

    # Split only on the FIRST occurrence so a variable whose own name
    # contains "treatment" keeps its full name in the title.
    column_label = str(treatment_column)
    _, marker, remainder = column_label.partition('treatment')
    chart_title = remainder[1:] if marker else column_label

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=comparison_df.index,
        y=comparison_df[treatment_column],
        name='Treatment',
        marker=dict(color='#053057'),
    ))
    fig.add_trace(go.Bar(
        x=comparison_df.index,
        y=comparison_df[control_column],
        name='Control',
        marker=dict(color='#8ac4f8'),
    ))

    fig.update_layout(
        xaxis=dict(title='quartiles'),
        yaxis=dict(title='values'),
        barmode='group',
        title=chart_title,
    )

    st.plotly_chart(fig, use_container_width=True)
|
|
|
|
def plot_propensity_distribution(treatment_data, control_data):
    """Overlay two semi-transparent histograms of propensity scores.

    Args:
        treatment_data: Values for the histogram labelled 'Treatment'.
        control_data: Values for the histogram labelled 'Control'.

    The figure is drawn with ``st.plotly_chart``; nothing is returned.
    """
    # (values, legend name, bar colour) — added in this order.
    series = (
        (treatment_data, 'Treatment', '#053057'),
        (control_data, 'Control', '#8ac4f8'),
    )

    fig = go.Figure()
    for values, label, colour in series:
        histogram = go.Histogram(
            x=values,
            name=label,
            marker=dict(color=colour),
            opacity=0.6,
        )
        fig.add_trace(histogram)

    x_axis = dict(title='propensity_score')
    y_axis = dict(title='count')
    fig.update_layout(
        xaxis=x_axis,
        yaxis=y_axis,
        barmode='overlay',
        title='Propensity Distribution',
    )

    st.plotly_chart(fig, use_container_width=True)
|
|
def comparison(df, variable):
    """Compare per-quartile means of ``variable`` between the Y==1 and Y==0 rows.

    Args:
        df: DataFrame with a 'Y' column, a 'quartiles' column and the
            column named by ``variable``.
        variable: Name of the column to average.

    Returns:
        DataFrame indexed by quartile (only quartiles present in both
        groups) with columns ``treatment_<variable>``,
        ``control_<variable>``, ``difference`` (absolute gap) and
        ``percent_difference`` (absolute gap relative to the treatment mean).
    """
    treatment_col = f'treatment_{variable}'
    control_col = f'control_{variable}'

    treated_means = df[df.Y == 1].groupby('quartiles')[variable].mean()
    control_means = df[df.Y == 0].groupby('quartiles')[variable].mean()

    # Both series carry the same name, so the merge suffixes them _x / _y.
    table = pd.merge(treated_means, control_means, left_index=True, right_index=True)
    table = table.rename(
        columns={f'{variable}_x': treatment_col, f'{variable}_y': control_col}
    )

    gap = table[treatment_col] - table[control_col]
    table['difference'] = np.abs(gap)
    table['percent_difference'] = np.abs(gap / table[treatment_col])
    return table
|
|
|
|
| |
|
|
def cohend(d1, d2):
    """Return Cohen's d (difference of means over the pooled standard deviation).

    Args:
        d1: First sample (any sized sequence NumPy can reduce).
        d2: Second sample.

    Returns:
        ``(mean(d1) - mean(d2)) / s`` where ``s`` is the pooled sample
        standard deviation (ddof=1). Returns 0.0 when the pooled standard
        deviation is zero, or when the samples together have too few
        observations for it to be defined (``len(d1) + len(d2) <= 2``).
    """
    n1, n2 = len(d1), len(d2)

    # The pooled variance divides by n1 + n2 - 2; bail out before that
    # denominator can be zero or negative.
    degrees_of_freedom = n1 + n2 - 2
    if degrees_of_freedom <= 0:
        return 0.0

    s1, s2 = np.var(d1, ddof=1), np.var(d2, ddof=1)
    s = sqrt(((n1 - 1) * s1 + (n2 - 1) * s2) / degrees_of_freedom)
    if s == 0:
        return 0.0

    # np.mean keeps the reduction in native code; statistics.mean is pure Python.
    return (np.mean(d1) - np.mean(d2)) / s
|
|
| |
def std_mean_diff(group_A_df, group_B_df):
    """Compute Cohen's d for every column of ``group_A_df`` against ``group_B_df``.

    Args:
        group_A_df: DataFrame whose columns define which metrics are compared.
        group_B_df: DataFrame that is indexed with the same column labels.

    Returns:
        NumPy array with one row per column of ``group_A_df``; each row is
        ``[column label, cohend value]``, produced by stacking the labels
        and values into one array and transposing it.
    """
    metric_names = group_A_df.columns
    effect_sizes = [
        cohend(group_A_df[name], group_B_df[name]) for name in metric_names
    ]

    stacked = np.array([metric_names, effect_sizes])
    return np.transpose(stacked)
|
|
| |
def _cohend_table(cohend_array):
    """Convert the (metric, value) rows from std_mean_diff into a DataFrame."""
    return pd.DataFrame({
        "Metrics": [str(row[0]) for row in cohend_array],
        "Cohend Value": pd.Series(
            [round(float(row[1]), 2) for row in cohend_array], dtype=float
        ),
    })


def cohend_code_function(binned_df, matching_df, identifier='individual_id_ov', flag='Y'):
    """Compute Cohen's d tables for matched and unmatched control groups.

    Args:
        binned_df: DataFrame containing the ``flag`` column (1 = treatment,
            0 = control), the ``identifier`` column and the metric columns.
        matching_df: DataFrame with an "Id" column (treatment ids) and a
            "matched_Id" column (the control id paired with each of them).
        identifier: Name of the id column in ``binned_df``.
        flag: Name of the treatment-indicator column in ``binned_df``.

    Returns:
        Tuple ``(std_mean_diff_df2, std_mean_diff_df)``:
        ``std_mean_diff_df2`` compares the matched treatment rows with ALL
        control rows; ``std_mean_diff_df`` compares them with the matched
        control rows only and carries an extra "flag" column (1 when the
        value lies outside [-0.1, 0.1]).

    Side effects:
        Writes the out-of-range count and the matched comparison table to
        the page via ``st.write``.
    """
    # drop() returns new frames, so the filtered slices are never mutated
    # in place.
    treat_df_complete = binned_df[binned_df[flag] == 1].drop(columns=flag)
    control_df_complete = binned_df[binned_df[flag] == 0].drop(columns=flag)

    treatment_ids = pd.DataFrame({identifier: matching_df["Id"]})
    control_ids = pd.DataFrame({identifier: matching_df["matched_Id"]})

    # One row per matched pair; the same treatment frame serves both comparisons.
    matched_treatment = (
        treatment_ids
        .merge(treat_df_complete, how='left', on=identifier)
        .drop(columns=identifier)
    )
    matched_control = (
        control_ids
        .merge(control_df_complete, how='left', on=identifier)
        .drop(columns=identifier)
    )

    std_mean_diff_df = _cohend_table(std_mean_diff(matched_treatment, matched_control))
    std_mean_diff_df["flag"] = (std_mean_diff_df["Cohend Value"].abs() > 0.1).astype(int)
    st.write('Number of variables with standard mean difference between treatment and control is out of desired range (-0.1, 0.1): ', std_mean_diff_df["flag"].sum())
    st.write(std_mean_diff_df)

    # Baseline: the same treatment rows against every control row.
    all_control = (
        control_df_complete[[identifier]]
        .merge(control_df_complete, how='left', on=identifier)
        .drop(columns=identifier)
    )
    std_mean_diff_df2 = _cohend_table(std_mean_diff(matched_treatment, all_control))

    return std_mean_diff_df2, std_mean_diff_df
|
|
def calculate_iv(df, flag, identifier):
    """Compute the Information Value (IV) of every feature against a binary flag.

    Each feature is cut into up to 10 quantile bins (duplicate edges are
    dropped). For each bin the share of all events and the share of all
    non-events falling in that bin are compared:

        WOE_bin = ln(event_share / non_event_share)
        IV      = sum((event_share - non_event_share) * WOE_bin)

    Bins whose WOE is undefined (no events or no non-events) contribute 0
    so the result stays finite.

    Args:
        df: DataFrame containing ``flag``, ``identifier``, a
            'propensity_score' column and the feature columns.
        flag: Name of the binary target column (1 = event).
        identifier: Name of the id column; excluded from the features.

    Returns:
        DataFrame with columns 'Feature' and 'IV', one row per feature, in
        the order the features appear in ``df``.
    """
    features = df.drop([flag, identifier, 'propensity_score'], axis=1)
    target = df[flag]

    rows = []
    for column in features.columns:
        bins = pd.qcut(features[column], q=10, duplicates='drop')
        per_bin = target.groupby(bins, observed=True).agg(['count', 'sum'])

        events = per_bin['sum']
        non_events = per_bin['count'] - per_bin['sum']
        event_share = events / events.sum()
        non_event_share = non_events / non_events.sum()

        # log(0) and 0/0 are expected for one-sided bins; silence the
        # warnings and neutralise those bins below.
        with np.errstate(divide='ignore', invalid='ignore'):
            woe = np.log(event_share / non_event_share)
        woe = woe.where(np.isfinite(woe), 0.0)

        iv = ((event_share - non_event_share) * woe).sum()
        rows.append({'Feature': column, 'IV': float(iv)})

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows, columns=['Feature', 'IV'])
|
|
def xgboost_feature_importance(df, flag, identifier):
    """Rank features by the importances of a default XGBClassifier fit on ``df``.

    Args:
        df: DataFrame containing ``flag``, ``identifier``, a
            'propensity_score' column and the feature columns.
        flag: Name of the target column.
        identifier: Name of the id column; excluded from the features.

    Returns:
        DataFrame with columns 'Feature' and 'Importance', sorted by
        'Importance' in descending order.
    """
    excluded = [flag, identifier, 'propensity_score']
    features = df.drop(excluded, axis=1)
    target = df[[flag]]

    classifier = xgb.XGBClassifier()
    classifier.fit(features, target)

    ranking = pd.DataFrame({
        'Feature': features.columns,
        'Importance': classifier.feature_importances_,
    })
    return ranking.sort_values(by='Importance', ascending=False)
|
|
| |
| |
|
|
|
|
def get_matching_pairs(identifier, treated_df, non_treated_df, sample_size_A, sample_size_B, matching_columns, flag):
    """Pair each sampled treated row with its nearest non-treated row.

    Both groups are randomly down-sampled, standardised with a scaler fit
    on the treated sample, and matched with a 1-nearest-neighbour search
    over the non-treated sample. A non-treated row may be matched to more
    than one treated row.

    Args:
        identifier: Name of the id column; it becomes the index used to
            report the pairs.
        treated_df: DataFrame of treated rows.
        non_treated_df: DataFrame of candidate control rows.
        sample_size_A: Percentage (0-100) of ``treated_df`` to sample.
        sample_size_B: Percentage (0-100) of ``non_treated_df`` to sample.
        matching_columns: Columns to keep; must include ``identifier`` and
            ``flag`` in addition to the features used for matching.
        flag: Name of the treatment-indicator column; dropped before matching.

    Returns:
        DataFrame with columns "Id" (treated id) and "matched_Id" (id of
        the nearest non-treated row), one row per sampled treated row.

    Raises:
        ValueError: If either sample is empty after down-sampling.
    """
    # Use the ``identifier`` argument rather than reaching into
    # st.session_state, so the function depends only on its inputs.
    treated = (
        treated_df[matching_columns]
        .sample(frac=sample_size_A / 100)
        .set_index(identifier)
        .drop(columns=flag)
    )
    candidates = (
        non_treated_df[matching_columns]
        .sample(frac=sample_size_B / 100)
        .set_index(identifier)
        .drop(columns=flag)
    )

    if treated.empty or candidates.empty:
        raise ValueError(
            "Sampling left the treated or non-treated group empty; "
            "increase the sample size."
        )

    # Fit the scaler on the treated sample only and apply it to both groups.
    scaler = StandardScaler()
    scaler.fit(treated.values)
    treated_x = scaler.transform(treated.values)
    candidates_x = scaler.transform(candidates.values)
    print("data transformaion completed")

    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(candidates_x)
    print("model fitting completed")

    _, indices = nbrs.kneighbors(treated_x)
    print("matching completed")

    # indices has shape (n_treated, 1); flatten to one position per treated row.
    matched_positions = indices.ravel()

    return pd.DataFrame({
        "Id": list(treated.index),
        "matched_Id": candidates.index[matched_positions],
    })
|
|
| |
# ---------------------------------------------------------------------------
# Page script: feature selection -> matching -> diagnostics.
#
# Reads binned_df, flag, identifier, treated_df and non_treated_df from
# st.session_state; this script never assigns them, so they must already be
# present when the page runs.
# NOTE(review): the nesting below was reconstructed from a paste that had
# lost its indentation — confirm the widget layout against the running app.
# ---------------------------------------------------------------------------
st.title("Matching")

binned_df = st.session_state.binned_df
flag = st.session_state.flag
identifier = st.session_state.identifier

# Rank features by the average of Information Value and XGBoost importance.
iv_df = calculate_iv(binned_df, flag, identifier)
importance_df = xgboost_feature_importance(binned_df, flag, identifier)

combined_df = pd.merge(iv_df, importance_df, on='Feature', suffixes=('_iv', '_importance'))
combined_df['Avg_IV_Importance'] = (combined_df['IV'] + combined_df['Importance']) / 2
combined_df = combined_df.sort_values('Avg_IV_Importance', ascending=False).reset_index(drop=True)
combined_df.insert(0, 'Select', False)

st.subheader("Feature importances")
st.session_state["edited_df_combined"] = st.data_editor(
    combined_df.style.hide(axis="index"),
    column_config={
        "Select": st.column_config.CheckboxColumn(required=True)
    },
    disabled=combined_df.drop("Select", axis=1).columns,
    use_container_width=True,
)

top_features_input = st.number_input(
    "Enter the number of top features",
    min_value=1,
    max_value=len(combined_df),
    value=None,
)

# A "top N" entry takes precedence over the manually ticked checkboxes.
if top_features_input is not None:
    selected_features = combined_df.head(top_features_input)['Feature'].tolist()
else:
    edited_df = st.session_state.edited_df_combined
    selected_features = edited_df[edited_df['Select']]['Feature'].tolist()

# get_matching_pairs needs the id and flag columns alongside the features.
selected_features.append(identifier)
selected_features.append(flag)
st.session_state.selected_features = selected_features

with st.expander("Matching Inputs", expanded=True):
    st.write("Matching Inputs")
    ui_columns = st.columns((1, 1))
    with ui_columns[0]:
        sample_size_A = st.slider("Sample Size for treatment Group", 1, 100, 100)
    with ui_columns[1]:
        sample_size_B = st.slider("Sample Size for Control Group", 1, 100, 100)
    with ui_columns[0]:
        st.write("#")
        run_matching = st.button(label="Run Matching")

st.divider()

if run_matching:
    st.session_state.matching_df = get_matching_pairs(
        identifier,
        st.session_state.treated_df,
        st.session_state.non_treated_df,
        sample_size_A,
        sample_size_B,
        st.session_state.selected_features,
        flag,
    )

# Everything below needs a matching result. .get() covers both "never set"
# and "set to None", so the page no longer fails before matching has run.
matching_df = st.session_state.get("matching_df")
if matching_df is None:
    st.info("Run matching to see the matched pairs and diagnostics.")
    st.stop()

st.dataframe(matching_df)
st.download_button(
    label="Download Matched Data as CSV",
    data=matching_df.to_csv(index=False).encode(),
    file_name='matching_data.csv',
    mime='text/csv',
)

st.subheader("Matching diagnostics")

# Matched controls plus all treatment rows, bucketed into propensity quartiles.
control_group = binned_df[binned_df[identifier].isin(matching_df['matched_Id'])]
treatment_group = binned_df[binned_df.Y == 1]

combined_group = pd.concat([control_group, treatment_group])
combined_group['quartiles'] = pd.qcut(combined_group['propensity_score'], 4, labels=False)
combined_group = combined_group.drop(identifier, axis=1)
st.session_state.combined_group = combined_group

# Remember that diagnostics were requested so they stay visible on the
# reruns triggered by the widgets further down.
if 'perform_diagnostics' not in st.session_state:
    st.session_state.perform_diagnostics = False

if st.button(label="Run Diagnostics"):
    st.session_state.perform_diagnostics = True

if st.session_state.perform_diagnostics:
    with st.expander("Matching Diagnostics", expanded=True):
        std_mean_diff_df2, std_mean_diff_df = cohend_code_function(binned_df, matching_df)
        st.subheader("Cohen's d Plot")
        cohend_plot_function(std_mean_diff_df2, std_mean_diff_df, selected_features)

        st.subheader("Pre-matching Propensity Distributions")
        plot_propensity_distribution(
            binned_df[binned_df.Y == 1]['propensity_score'],
            binned_df[binned_df.Y == 0]['propensity_score'],
        )

        st.subheader("Post-matching Propensity Distributions")
        propensity_lookup = binned_df[[identifier, 'propensity_score']]

        # Attach the treatment row's score, then the matched control's score.
        temp = pd.merge(left=matching_df, right=propensity_lookup, left_on='Id', right_on=identifier, how='left')
        temp = temp.drop(identifier, axis=1)
        temp = temp.rename({'Id': 'treatment_id', 'matched_Id': 'control_id', 'propensity_score': 'treatment_propensity'}, axis=1)
        temp = pd.merge(left=temp, right=propensity_lookup, left_on='control_id', right_on=identifier, how='left')
        temp = temp.drop(identifier, axis=1)
        temp = temp.rename({'propensity_score': 'control_propensity'}, axis=1)

        plot_propensity_distribution(temp['treatment_propensity'], temp['control_propensity'])

    with st.expander("Comparison Plots", expanded=True):
        st.markdown(
            "<p class='plot-header'>Change the selected variable to plot"
            " different charts</p>",
            unsafe_allow_html=True,
        )
        left, right = st.columns(2)
        with left:
            if 'selected_variable_comp' not in st.session_state:
                st.session_state.selected_variable_comp = []

            selected_variable_comp = st.multiselect(
                "Variable",
                st.session_state.combined_group.columns,
                st.session_state.selected_variable_comp,
            )
            st.session_state.selected_variable_comp = selected_variable_comp

        # One grouped bar chart per selected variable.
        for var in st.session_state.selected_variable_comp:
            plot_comparison(comparison(combined_group, var))
|
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|