| import numpy as np |
| import pandas as pd |
| from sklearn.tree import DecisionTreeClassifier |
| from sklearn.model_selection import GridSearchCV |
| import matplotlib.pyplot as plt |
| from tqdm import tqdm |
| from matplotlib.ticker import MaxNLocator |
| import streamlit as st |
| import ast |
| from collections import defaultdict |
| from scipy.cluster.hierarchy import linkage, fcluster, dendrogram |
| from sklearn.cluster import KMeans, AgglomerativeClustering |
| from sklearn.preprocessing import LabelEncoder |
| |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
| |
| import warnings |
| import pandas as pd |
| import numpy as np |
| from scipy import stats |
| import scipy.cluster.hierarchy as sch |
| from scipy.spatial.distance import pdist |
| import os |
| import re |
| import time |
| from plotly.subplots import make_subplots |
| import plotly.graph_objects as go |
| import numpy as np |
| import plotly.express as px |
| import base64 |
|
|
|
|
def tree_based_bin_data(df, column_name, dep_var, depth_of_tree):
    """Bin a numeric column at the split points of a tuned decision tree.

    A ``DecisionTreeClassifier`` is grid-searched (accuracy scoring) on the
    non-null rows of ``column_name`` against ``dep_var``; the split
    thresholds of the best tree become the bin edges.

    Args:
        df: DataFrame holding ``column_name`` and ``dep_var``.
        column_name: Column to bin.
        dep_var: Target column the tree is fitted against.
        depth_of_tree: Largest ``max_depth`` tried by the grid search.

    Returns:
        ``value_bin_data(df, column_name, edges)`` where ``edges`` are the
        sorted, de-duplicated thresholds enclosed by ``-inf`` and ``inf``.
    """
    fit_rows = df.loc[df[column_name].notnull()]
    x = fit_rows[column_name].values.reshape(-1, 1)
    y = fit_rows[dep_var].values

    # range(2, depth + 1) is empty when depth < 2 and GridSearchCV rejects an
    # empty grid, so fall back to a single shallow depth in that case.
    deepest = max(int(depth_of_tree), 1)
    params = {
        'max_depth': range(min(2, deepest), deepest + 1),
        'min_samples_split': [2, 3, 5, 10],
        # every leaf must hold at least 5% of the fitted rows
        'min_samples_leaf': [int(np.ceil(0.05 * len(x)))],
    }
    g_search = GridSearchCV(DecisionTreeClassifier(), param_grid=params, scoring='accuracy')
    g_search.fit(x, y)
    fitted_tree = g_search.best_estimator_.tree_

    # Leaves have identical child ids. Selecting split nodes this way keeps a
    # genuine threshold of -2.0, which a `threshold != -2` filter would drop.
    is_split = fitted_tree.children_left != fitted_tree.children_right
    thresholds = sorted(set(fitted_tree.threshold[is_split]))

    # Open-ended outer edges so values beyond the first/last threshold land
    # in a bin instead of becoming NaN.
    bin_edges = [-np.inf, *thresholds, np.inf]
    return value_bin_data(df, column_name, bin_edges)
|
|
|
|
def decile_bin_data(df, col, no_of_bins):
    """Quantile-bin ``df[col]`` into ``no_of_bins`` bins.

    Duplicate quantile edges are dropped, so fewer bins than requested may
    come back.
    """
    return pd.qcut(df[col], no_of_bins, duplicates='drop')
|
|
|
|
def value_bin_data(df, col, no_of_bins):
    """Cut ``df[col]`` with ``pd.cut``.

    ``no_of_bins`` is handed to ``pd.cut`` unchanged, so it may be a bin
    count or a sequence of edges (as ``tree_based_bin_data`` passes).
    Duplicate edges are dropped.
    """
    return pd.cut(df[col], no_of_bins, duplicates='drop')
|
|
|
|
def col_bin_summary_numerical(bin_df, col, dep_var=None):
    """Summarise the bins of one column.

    Args:
        bin_df: DataFrame whose ``col`` column holds the bin of each row and,
            when ``dep_var`` is given, the target column as well.
        col: Name of the binned column.
        dep_var: Optional target column name.

    Returns:
        DataFrame sorted by ``bin_ranges`` with ``bin_ranges`` and ``count%``
        (share of rows, rounded to 2 decimals). With ``dep_var`` it also has
        ``Event`` (target sum per bin), ``Mean_DV`` (target mean per bin) and
        ``Index`` (100 * bin mean / overall mean, rounded).
    """
    share = bin_df[col].value_counts() / len(bin_df) * 100
    # Naming the axis and the values explicitly gives the same two columns
    # however value_counts labels its result, so no try/except is needed.
    share_df = (
        share.rename_axis('bin_ranges')
        .reset_index(name='count%')
        .sort_values(by='bin_ranges')
        .reset_index(drop=True)
    )

    df_new = pd.DataFrame({"bin_ranges": bin_df[col].unique()})
    df_new = df_new.merge(share_df, on='bin_ranges').round(2)

    if dep_var is not None:
        per_bin = bin_df.groupby(col)[dep_var]
        events = per_bin.sum().reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Event'})
        means = per_bin.mean().reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Mean_DV'})
        df_new = df_new.merge(events, on='bin_ranges', how='left')
        df_new = df_new.merge(means, on='bin_ranges', how='left')
        # The baseline is the overall mean of the target that was passed in,
        # not a column literally named 'Y'.
        df_new['Index'] = (100 * df_new['Mean_DV'] / bin_df[dep_var].mean()).round()
        df_new = df_new[['bin_ranges', 'count%', 'Event', 'Mean_DV', 'Index']]

    return df_new.sort_values(by='bin_ranges')
|
|
|
|
|
|
|
|
|
|
def plot_chart(df, col, dep_var):
    """Build a bar + line chart for one column's bin summary.

    Bars show ``count%`` on the primary y-axis and a line shows ``Index`` on
    the secondary y-axis, both against the stringified ``bin_ranges``.

    Args:
        df: Summary frame with ``bin_ranges``, ``count%`` and ``Index``.
            A ``bin_ranges_str`` column is added to it in place.
        col: Column name, used only in the chart title.
        dep_var: Not used by this function.

    Returns:
        The Plotly figure.
    """
    bar_color = '#053057'
    line_color = "#8ac4f8"

    # The string form of each bin is what goes on the x-axis.
    df['bin_ranges_str'] = df['bin_ranges'].astype(str)
    bin_labels = df['bin_ranges_str']

    count_bars = go.Bar(
        x=bin_labels,
        y=df['count%'],
        name='Count%',
        marker_color=bar_color,
        hovertemplate=(
            f"Bin: %{{x}}<br>"
            f"Count%: %{{y}}"
        ),
    )
    index_line = go.Scatter(
        x=bin_labels,
        y=df['Index'],
        mode='lines+markers',
        name='Index',
        marker=dict(color=line_color),
        hovertemplate=(
            f"Bin: %{{x}}<br>"
            f"Index%: %{{y}}"
        ),
    )

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(count_bars)
    fig.add_trace(index_line, secondary_y=True)

    fig.update_layout(
        title=f'Distribution of {col}',
        xaxis=dict(title='Bin_ranges'),
        yaxis=dict(title='Count%', color=bar_color),
        yaxis2=dict(title='Index', color=line_color, overlaying='y', side='right'),
        legend=dict(x=1.02, y=0.98),
        hovermode='x'
    )
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False)
    return fig
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
|
|
|
|
|
|
|
|
|
|
def create_numerical_binned_data(df, col, func, no_of_bins=None, dep_var=None, depth=None):
    """Bin one numeric column and return its bin summary.

    Args:
        df: Source DataFrame (not modified; a copy is used).
        col: Numeric column to bin.
        func: ``'tree'``, ``'decile'`` or anything else for value
            (equal-width) binning. ``'tree'`` needs ``dep_var``; without it
            the value binning path is used.
        no_of_bins: Bin count for value binning and decile binning. For
            ``'decile'`` a missing value falls back to 10.
        dep_var: Optional target column; cast to ``int64`` on the copy.
        depth: Maximum tree depth for ``'tree'``.

    Returns:
        The summary from ``col_bin_summary_numerical``.
    """
    df_org = df.copy()
    if dep_var is not None:
        df_org[dep_var] = df_org[dep_var].astype('int64')
        df_num = df_org.select_dtypes(include=[np.number]).drop(dep_var, axis=1)
    else:
        df_num = df_org.select_dtypes(include=[np.number])

    if func == 'tree' and dep_var is not None:
        # Fit on the copy so the tree sees the same int-cast target that the
        # summary is computed from.
        bins = tree_based_bin_data(df_org, col, dep_var, depth)
    elif func == 'decile':
        quantiles = no_of_bins if no_of_bins is not None else 10
        bins = decile_bin_data(df_num, col, quantiles)
    else:
        bins = value_bin_data(df_num, col, no_of_bins)

    if dep_var is not None:
        bin_df = pd.concat([bins, df_org[dep_var]], axis=1)
    else:
        # The summary indexes its input with [col], so it needs a DataFrame
        # rather than the bare Series the binning helpers return.
        bin_df = bins.to_frame()

    return col_bin_summary_numerical(bin_df, col, dep_var)
|
|
|
|
def create_numerical_binned_data1(df, col, func, no_of_bins, dep_var, depth=None):
    """Expand one numeric column into one column per bin.

    Args:
        df: Source DataFrame (not modified; a copy is used).
        col: Numeric column to bin.
        func: ``'tree'``, ``'decile'`` or anything else for value binning.
        no_of_bins: Bin count for decile / value binning.
        dep_var: Target column; cast to ``int64`` on the copy.
        depth: Maximum tree depth for ``'tree'``.

    Returns:
        DataFrame with the original ``col`` plus one ``f"{col}_{bin}"``
        column per distinct bin. Each bin column carries the original value
        on rows that fall in that bin and 0 elsewhere.
    """
    df_org = df.copy()
    df_org[dep_var] = df_org[dep_var].astype('int64')
    df_num = df_org.select_dtypes(include=[np.number]).drop(dep_var, axis=1)

    if func == 'tree':
        bins = tree_based_bin_data(df_org, col, dep_var, depth)
    elif func == 'decile':
        bins = decile_bin_data(df_num, col, no_of_bins)
    else:
        bins = value_bin_data(df_num, col, no_of_bins)

    binned_data = pd.DataFrame()
    binned_data[col] = df_org[col]
    for bin_value in bins.unique():
        # Membership is tested on the bin labels. Comparing the raw numbers
        # with an Interval never matches, which left every column at 0.
        if pd.isna(bin_value):
            in_bin = bins.isna()
        else:
            in_bin = bins == bin_value
        binned_data[f"{col}_{bin_value}"] = np.where(in_bin, df_org[col], 0)

    return binned_data
|
|
|
|
| |
|
|
def woe_iv(df, column_name, dep_var, no_of_bins):
    """Bin a categorical column, using weight of evidence when it is wide.

    Columns with fewer than 10 distinct values get one bin per value
    (``bin1``, ``bin2``, ... in order of first appearance; missing values
    become ``bin0``). Otherwise each category is replaced by its weight of
    evidence against ``dep_var`` and the WoE values are quantile-binned.
    The bin count is reduced until every bin holds at least 5% of the rows
    or only one bin is left.

    Args:
        df: DataFrame holding ``column_name`` and ``dep_var``.
        column_name: Column to bin.
        dep_var: Target column with classes 0 and 1.
        no_of_bins: Starting number of quantile bins for the WoE path.

    Returns:
        Series of ``binN`` labels aligned with ``df``.
    """
    if df[column_name].nunique() < 10:
        # factorize codes start at 0 (-1 for missing), hence the +1
        codes = pd.factorize(df[column_name])[0] + 1
        return (
            pd.Series(codes, index=df.index)
            .rename('{}'.format(column_name))
            .apply(lambda x: f'bin{x}')
        )

    # Class totals are only needed here, so the low-cardinality branch no
    # longer fails when the target happens to contain a single class.
    class_counts = df[dep_var].value_counts()
    y0 = class_counts[0]
    y1 = class_counts[1]

    woe_table = pd.crosstab(df[column_name], df[dep_var], normalize='columns').assign(
        # 0.5 / class size keeps the log finite for empty cells
        woe=lambda dfx: np.log((dfx[1] + (0.5 / y1)) / (dfx[0] + (0.5 / y0)))
    )
    woe_col = df[column_name].map(woe_table['woe'].to_dict())

    def _qcut_labelled(q):
        binned = pd.qcut(woe_col, q, duplicates='drop')
        # Labels are sized by the category count so an empty quantile bin
        # cannot make rename_categories raise.
        labels = [f'bin{i}' for i in range(1, len(binned.cat.categories) + 1)]
        return binned.cat.rename_categories(labels)

    min_size = 0.05
    data = _qcut_labelled(no_of_bins)
    while data.value_counts(normalize=True).min() < min_size and no_of_bins > 1:
        no_of_bins -= 1
        data = _qcut_labelled(no_of_bins)
    return data
|
|
def naive_cat_bin(df, col, max_thre=10, min_thre=5, tolerence=2, flag='ignore'):
    """Group the categories of ``col`` into bins by their share of rows.

    Categories holding at least ``min_thre`` percent of the rows each get
    their own bin. The remaining categories are accumulated, smallest
    first, into groups whose combined share reaches the target share minus
    ``tolerence``. Any group still below ``min_thre - tolerence`` is merged
    into the bin before it.

    Args:
        df: DataFrame holding ``col``.
        col: Column to bin.
        max_thre: Percent used to derive the total bin budget
            (``int(100 / max_thre)``).
        min_thre: Minimum percent for a category to stand alone.
        tolerence: Slack, in percent, applied to the group targets. Forced
            to 0 when no bin budget is left for the small categories.
        flag: With ``'error'``, a message is printed and ``None`` returned
            when the leftover share per remaining bin exceeds ``max_thre``.

    Returns:
        Series aligned with ``df[col]`` holding ``bin1``, ``bin2``, ...;
        missing values map to ``binNA``. ``None`` in the ``'error'`` case.
    """
    value_counts = df[col].value_counts()
    categories = list(value_counts.index)
    count_per = list((value_counts / len(df)) * 100)

    # Single pass partition instead of a list-membership test per value.
    large = [pct for pct in count_per if pct >= min_thre]
    small = [pct for pct in count_per if not pct >= min_thre]

    total_bins = int(100 / max_thre)
    bin_req = total_bins - len(large)
    bin_req_per = 100 - sum(large)

    if flag == 'error' and bin_req > 0 and (bin_req_per / bin_req) > max_thre:
        print(f"Binning for {col} is not possible with given parameters.")
        return

    # Give up bins until the average leftover share per bin exceeds min_thre.
    while bin_req > 0 and not (bin_req_per / bin_req) > min_thre:
        bin_req -= 1

    if bin_req > 0:
        target_sum = bin_req_per / bin_req
    else:
        target_sum = bin_req_per
        tolerence = 0

    # Walk the small categories from the rarest upwards, closing a group as
    # soon as its share reaches the target.
    small_groups = []
    current = []
    current_sum = 0.0
    for pct in reversed(small):
        current.append(pct)
        current_sum += pct
        if not current_sum < target_sum - tolerence:
            small_groups.append(current)
            current = []
            current_sum = 0.0
    small_groups.append(current)
    small_groups = [group for group in reversed(small_groups) if group]

    # Fold undersized groups into their predecessor. An empty input simply
    # yields no groups rather than an IndexError.
    merged = []
    for group in [[pct] for pct in large] + small_groups:
        if merged and sum(group) < (min_thre - tolerence):
            merged[-1].extend(group)
        else:
            merged.append(group)

    # Groups hold percentages; translate each back to a category. Equal
    # percentages are handed out last-seen first.
    by_pct = defaultdict(list)
    for category, pct in zip(categories, count_per):
        by_pct[pct].append(category)

    bin_mapping = {}
    for number, group in enumerate(merged, start=1):
        for pct in group:
            bin_mapping[by_pct[pct].pop()] = f'bin{number}'

    # NaN never compares equal to itself, so it is tested explicitly rather
    # than relied on as a dictionary key.
    return df[col].apply(lambda x: 'binNA' if pd.isna(x) else bin_mapping.get(x, x))
|
|
def col_bin_summary_categorical(df_cat, col, binned_df_1, dep_var=None):
    """Summarise the bins of one categorical column.

    Args:
        df_cat: DataFrame holding the original values of ``col``.
        col: Column name, present in both frames.
        binned_df_1: DataFrame whose ``col`` holds the bin label of each row
            and, when ``dep_var`` is given, the target column.
        dep_var: Optional target column name.

    Returns:
        DataFrame with ``column_name``, ``bin_ranges``, ``values in bin``
        (original values per bin) and ``count%``; with ``dep_var`` also
        ``Event``, ``Mean_DV`` and ``Index``. Bins are ordered by the first
        number in their label; labels without a number come last.
    """
    unique_values_in_bins = (
        df_cat.groupby(binned_df_1[col])[col].unique().apply(list)
        .rename_axis('bin')
        .reset_index()
        .rename(columns={'bin': 'bin_ranges', col: 'values in bin'})
    )

    unique_bin_ranges = pd.Categorical(binned_df_1[col].unique())
    uni = binned_df_1[col].nunique()

    def _bin_order(label):
        # 'binNA', 'NA' and similar digit-free labels sort after the
        # numbered bins instead of raising on a missing match.
        found = re.search(r'\d+', str(label))
        return int(found.group()) if found else uni

    unique_bin_ranges = unique_bin_ranges[np.argsort([_bin_order(val) for val in unique_bin_ranges])]

    df_new_cat = pd.DataFrame({"column_name": [col] * len(unique_bin_ranges), "bin_ranges": unique_bin_ranges})
    df_new_cat = df_new_cat.merge(unique_values_in_bins)

    # Explicit axis/value names give the same columns however value_counts
    # labels its result.
    share_df = (
        (binned_df_1[col].value_counts() / len(binned_df_1) * 100)
        .rename_axis('bin_ranges')
        .reset_index(name='count%')
        .sort_values(by='bin_ranges')
        .reset_index(drop=True)
    )
    df_new_cat = df_new_cat.merge(share_df, on='bin_ranges').round(2)

    if dep_var is not None:
        per_bin = binned_df_1.groupby(col)[dep_var]
        events = per_bin.sum(numeric_only=True).reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Event'})
        means = per_bin.mean(numeric_only=True).reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Mean_DV'})
        df_new_cat = df_new_cat.merge(events, on='bin_ranges')
        df_new_cat = df_new_cat.merge(means, on='bin_ranges')
        df_new_cat['Index'] = (100 * df_new_cat['Mean_DV'] / binned_df_1[dep_var].mean()).round()
    return df_new_cat
|
|
def create_categorical_binned_data(imputed_df, col, categorical_binning, dep_var, no_of_bins=None, max_thre=None, min_thre=None, tolerence=2, flag='ignore'):
    """Bin one categorical column and return its bin summary.

    Args:
        imputed_df: Source DataFrame. Its ``dep_var`` column is cast to
            ``int64`` in place.
        col: Categorical column to bin.
        categorical_binning: ``'woe_iv'`` or ``'naive'``.
        dep_var: Target column name.
        no_of_bins: Starting bin count for ``'woe_iv'``.
        max_thre: Max threshold for ``'naive'``; ``None`` means 20.
        min_thre: Min threshold for ``'naive'``; ``None`` means 5.
        tolerence: Tolerance for ``'naive'``.
        flag: Accepted for interface compatibility; ``'ignore'`` is always
            used because the ``'error'`` mode of ``naive_cat_bin`` returns
            ``None``, which the steps below cannot process.

    Returns:
        The summary from ``col_bin_summary_categorical``.

    Raises:
        ValueError: If ``categorical_binning`` is not a known method.
    """
    if categorical_binning not in ('woe_iv', 'naive'):
        raise ValueError(f"Unknown categorical binning method: {categorical_binning!r}")

    imputed_df[dep_var] = imputed_df[dep_var].astype('int64')
    df_nominal = pd.concat([imputed_df[col], imputed_df[dep_var]], axis=1)
    tqdm.pandas(dynamic_ncols=True, position=0)

    if categorical_binning == 'woe_iv':
        binned_df_nominal = df_nominal.progress_apply(lambda x: woe_iv(df_nominal, x.name, dep_var, no_of_bins))
        binned_df_nominal.drop(dep_var, axis=1, inplace=True)
        # Missing bins get the 'binNA' label that the summary step and
        # naive_cat_bin already use.
        binned_df_nominal = binned_df_nominal.astype(object).where(binned_df_nominal.notna(), 'binNA')
    else:
        naive_max = 20 if max_thre is None else max_thre
        naive_min = 5 if min_thre is None else min_thre
        binned_df_nominal = df_nominal.progress_apply(
            lambda x: naive_cat_bin(df_nominal, x.name, naive_max, naive_min, tolerence, flag='ignore')
        )
        binned_df_nominal.drop(dep_var, axis=1, inplace=True)
        binned_df_nominal = binned_df_nominal.dropna(axis=1, how='all')

    binned_df_nominal = binned_df_nominal.astype('category')

    # A column that collapsed into a single bin carries no information.
    cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
    binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)

    binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dep_var]], axis=1)

    # The summary only reads the original values of `col`, so hand it just
    # that column; this also works for category-typed columns.
    return col_bin_summary_categorical(imputed_df[[col]], col, binned_df_nominal_1, dep_var)
|
|
def create_categorical_binned_data1(imputed_df, col, nominal_binning, dependant_target_variable, no_of_bins=10, max_thre=10, min_thre=5, tolerence=2, flag='ignore', min_cluster_size=0.05, max_clusters=10):
    """Expand one categorical column into one 0/1 indicator column per bin.

    Args:
        imputed_df: Source DataFrame. Its target column is cast to ``int64``
            in place.
        col: Categorical column to bin.
        nominal_binning: ``'woe'`` / ``'woe_iv'`` or ``'naive'``.
        dependant_target_variable: Target column name.
        no_of_bins: Starting bin count for the WoE method.
        max_thre, min_thre, tolerence, flag: Accepted but not applied; the
            naive method runs with 20, 5, 2 and ``'ignore'``.
        min_cluster_size, max_clusters: Not used.

    Returns:
        DataFrame indexed like ``imputed_df`` with one
        ``f"{col}_{values}"`` column per bin, 1 where the row's value
        belongs to the bin and 0 otherwise.

    Raises:
        ValueError: If ``nominal_binning`` is not a known method.
    """
    # The profiling page passes 'woe_iv'; both spellings select WoE binning.
    if nominal_binning not in ('woe', 'woe_iv', 'naive'):
        raise ValueError(f"Unknown categorical binning method: {nominal_binning!r}")

    target = dependant_target_variable
    imputed_df[target] = imputed_df[target].astype('int64')
    df_nominal = pd.concat([imputed_df[col], imputed_df[target]], axis=1)
    tqdm.pandas(dynamic_ncols=True, position=0)

    if nominal_binning in ('woe', 'woe_iv'):
        binned_df_nominal = df_nominal.progress_apply(lambda x: woe_iv(df_nominal, x.name, target, no_of_bins))
        binned_df_nominal.drop(target, axis=1, inplace=True)
        # Missing bins get the 'binNA' label that the summary step and
        # naive_cat_bin already use.
        binned_df_nominal = binned_df_nominal.astype(object).where(binned_df_nominal.notna(), 'binNA')
    else:
        binned_df_nominal = df_nominal.progress_apply(lambda x: naive_cat_bin(df_nominal, x.name, 20, 5, 2, flag='ignore'))
        binned_df_nominal.drop(target, axis=1, inplace=True)
        binned_df_nominal = binned_df_nominal.dropna(axis=1, how='all')

    binned_df_nominal = binned_df_nominal.astype('category')

    # A column that collapsed into a single bin carries no information.
    cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
    binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)

    binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[target]], axis=1)
    df_summary = col_bin_summary_categorical(imputed_df[[col]], col, binned_df_nominal_1, target)

    # Carry the source index so the indicators line up row by row when they
    # are concatenated next to other per-column frames.
    binned_data = pd.DataFrame(index=imputed_df.index)
    for bin_value in df_summary['values in bin']:
        binned_data[f"{col}_{bin_value}"] = np.where(imputed_df[col].isin(bin_value), 1, 0)

    return binned_data
|
|
|
|
|
|
# Columns offered for profiling: numeric columns without the target flag,
# and object/category columns without the row identifier.
numerical_columns = [
    name
    for name in st.session_state.imputed_df.select_dtypes(include=['number']).columns.tolist()
    if name != st.session_state.flag
]
categorical_columns = [
    name
    for name in st.session_state.imputed_df.select_dtypes(include=['object', 'category']).columns.tolist()
    if name != st.session_state.identifier
]
st.session_state.numerical_columns = numerical_columns
st.session_state.categorical_columns = categorical_columns

st.title("Variable Profiling")

# Starting values for the input widgets: whatever an earlier run stored in
# the session, otherwise the defaults given here.
function_num = st.session_state.get("function_num", "value")
depth = st.session_state.get("depth", 3)
num_bins = st.session_state.get("num_bins", 10)
function_cat = st.session_state.get("function_cat", "woe_iv")
max_slider = st.session_state.get("max_slider", 10)
min_slider = st.session_state.get("min_slider", 5)
cat_bins_iv = st.session_state.get("cat_bins_iv", 10)
cat_bins_naive = st.session_state.get("cat_bins_naive", 10)
|
|
# Input widgets for the binning methods. Every choice is written back to the
# session state right after it is read.
with st.expander("Profiling Inputs"):
    st.write("Binning Inputs")
    ui_columns = st.columns((1, 1))

    with ui_columns[0]:
        num_options = ['value', 'tree']
        stored_num = st.session_state.get("function_num")
        function_num = st.selectbox(
            label="Select Numerical Binning Function",
            options=num_options,
            # no pre-selection until a method has been stored in the session
            index=num_options.index(stored_num) if stored_num is not None else None
        )
        st.session_state.function_num = function_num
        params_num = st.empty()

    # Parameter slider for the chosen numerical method, in the right column.
    with params_num:
        with ui_columns[-1]:
            if function_num == 'tree':
                depth = st.slider(
                    label="Depth",
                    min_value=1,
                    max_value=10,
                    value=depth,
                    key='depth_slider')
                st.session_state.depth = depth
            elif function_num == 'value':
                num_bins = st.slider(
                    label="Number of Bins",
                    min_value=2,
                    max_value=20,
                    value=num_bins,
                    key='num_bins_slider_num')
                st.session_state.num_bins = num_bins

    left, right = st.columns(2)

    with left:
        cat_options = ['woe_iv', 'naive']
        stored_cat = st.session_state.get("function_cat")
        function_cat = st.selectbox(
            label="Select Categorical Binning Function",
            options=cat_options,
            index=cat_options.index(stored_cat) if stored_cat is not None else None
        )
        st.session_state.function_cat = function_cat
        params_cat = st.empty()

    # Parameter sliders for the chosen categorical method.
    with params_cat:
        if function_cat == 'woe_iv':
            with right:
                cat_bins_iv = st.slider(
                    label="Number of Bins",
                    min_value=2,
                    max_value=20,
                    value=cat_bins_iv,
                    key='num_bins_slider_cat_iv')
                st.session_state.cat_bins_iv = cat_bins_iv
            with left:
                min_slider = st.slider(
                    label="Min Threshold",
                    min_value=1,
                    max_value=100,
                    value=min_slider,
                    key='min_slider')
                st.session_state.min_slider = min_slider
            with right:
                max_slider = st.slider(
                    label="Max Threshold",
                    min_value=1,
                    max_value=100,
                    value=max_slider,
                    key='max_slider')
                st.session_state.max_slider = max_slider
        elif function_cat == 'naive':
            with right:
                cat_bins_naive = st.slider(
                    label="Number of Bins",
                    min_value=2,
                    max_value=20,
                    value=cat_bins_naive,
                    key='num_bins_slider_cat_naive')
                st.session_state.cat_bins_naive = cat_bins_naive

    with left:
        st.write("#")
        perform_profiling = st.button(
            label="Perform profiling"
        )
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# Summary tables, built only on the run in which the button was pressed.
if perform_profiling:
    with st.expander("Profiling summary"):
        st.write("Numerical binned data")
        # Only the parameter that belongs to the chosen method is passed on.
        depth = depth if function_num == 'tree' else None
        num_bins = num_bins if function_num == 'value' else None

        numeric_summaries = []
        for col in st.session_state.numerical_columns:
            binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
            binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
            numeric_summaries.append(binned_data_col)
        # One concat at the end instead of re-copying the frame per column.
        binned_data_num = pd.concat(numeric_summaries, axis=0) if numeric_summaries else pd.DataFrame()
        st.dataframe(binned_data_num, use_container_width=True, hide_index=True)

        st.write("Categorical binned data")
        # A single if/elif keeps the WoE bin count from being reset to None
        # by a second, naive-only check.
        if function_cat == 'woe_iv':
            max_thre = max_slider
            min_thre = min_slider
            no_of_bins = cat_bins_iv
        elif function_cat == 'naive':
            max_thre = None
            min_thre = None
            no_of_bins = cat_bins_naive
        else:
            max_thre = None
            min_thre = None
            no_of_bins = None

        categorical_summaries = []
        for col in st.session_state.categorical_columns:
            binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
            binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
            binned_data_col_cat.drop('column_name', axis=1, inplace=True)
            categorical_summaries.append(binned_data_col_cat)
        binned_data_cat = pd.concat(categorical_summaries, axis=0) if categorical_summaries else pd.DataFrame()
        st.dataframe(binned_data_cat, use_container_width=True, hide_index=True)
|
|
|
|
# Per-variable chart: the bin summary of the selected column is recomputed
# and drawn with plot_chart.
with st.expander("Profiling summary: Plots"):
    st.markdown(
        "<p class='plot-header'>Change the selected variable to plot"
        " different charts</p>",
        unsafe_allow_html=True,
    )
    left, right = st.columns(2)
    with left:
        if 'selected_variable' not in st.session_state:
            st.session_state.selected_variable = []

        selected_variable = st.selectbox(
            "Variable",
            st.session_state.numerical_columns + st.session_state.categorical_columns,
        )
        # The loop below expects a list of names.
        if isinstance(selected_variable, str):
            selected_variable = [selected_variable]

        st.session_state.selected_variable = selected_variable

    if st.session_state.selected_variable:
        # Only the parameter that belongs to the chosen method is passed on.
        depth = depth if function_num == 'tree' else None
        num_bins = num_bins if function_num == 'value' else None

        # A single if/elif keeps the WoE bin count from being reset to None
        # by a second, naive-only check.
        if function_cat == 'woe_iv':
            max_thre = max_slider
            min_thre = min_slider
            no_of_bins = cat_bins_iv
        elif function_cat == 'naive':
            max_thre = None
            min_thre = None
            no_of_bins = cat_bins_naive
        else:
            max_thre = None
            min_thre = None
            no_of_bins = None

        for col in st.session_state.selected_variable:
            if col in st.session_state.numerical_columns:
                binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
                binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
                fig = plot_chart(binned_data_col, col, dep_var=None)
                st.plotly_chart(fig, use_container_width=True)

            elif col in st.session_state.categorical_columns:
                binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
                binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
                binned_data_col_cat.drop('column_name', axis=1, inplace=True)
                fig_cat = plot_chart(binned_data_col_cat, col, dep_var=None)
                st.plotly_chart(fig_cat, use_container_width=True)
|
|
|
|
st.divider()

# Build the model-ready frame: every profiled column expanded into per-bin
# columns, plus the identifier and the target flag.
depth = depth if function_num == 'tree' else None
num_bins = num_bins if function_num == 'value' else None

# A single if/elif keeps the WoE bin count from being reset to None by a
# second, naive-only check.
if function_cat == 'woe_iv':
    max_thre = max_slider
    min_thre = min_slider
    no_of_bins = cat_bins_iv
elif function_cat == 'naive':
    max_thre = None
    min_thre = None
    no_of_bins = cat_bins_naive
else:
    max_thre = None
    min_thre = None
    no_of_bins = None

binned_frames = []
for col in st.session_state.numerical_columns:
    binned_data_num = create_numerical_binned_data1(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
    binned_frames.append(binned_data_num)

for col in st.session_state.categorical_columns:
    binned_data_cat = create_categorical_binned_data1(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
    binned_frames.append(binned_data_cat)

# One concat at the end instead of re-copying the frame per column.
binned_data_combined = pd.concat(binned_frames, axis=1) if binned_frames else pd.DataFrame()


def clean_column_name(column_name):
    """Drop every '.<digits>' run from a column name."""
    return re.sub(r'\.(\d+)', '', column_name)


binned_data_combined.columns = binned_data_combined.columns.map(clean_column_name)
# Strip the bracket, quote, comma and space characters left over from the
# interval / list text embedded in the generated column names.
valid_feature_names = [
    name.replace('[', '').replace(']', '').replace('<', '').replace(',', '_').replace('(', '').replace("'", '')
    for name in binned_data_combined.columns
]
valid_feature_names = [name.replace(' ', '').replace(' ', '') for name in valid_feature_names]
binned_data_combined.columns = valid_feature_names

st.session_state.binned_df = binned_data_combined
st.session_state.binned_df[st.session_state.flag] = st.session_state.imputed_df[st.session_state.flag]
st.session_state.binned_df.insert(0, st.session_state.identifier, st.session_state.imputed_df[st.session_state.identifier])
# Debug output of the identifier column, using the configured identifier
# name rather than one fixed column name.
print(st.session_state.binned_df[st.session_state.identifier])

st.markdown("Binned DataFrame")
st.dataframe(binned_data_combined.head(10), use_container_width=True, hide_index=True)

if st.session_state.binned_df is not None:
    download_button = st.download_button(
        label="Download Binned Data as CSV",
        data=st.session_state.binned_df.to_csv(index=False).encode(),
        file_name='binned_data.csv',
        mime='text/csv',
    )
|
|
|
|
| |
| |
| |
| |
| |
| |
|
|
|
|
|
|
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|