Spaces:

BlendMMM
/

SCM

Sleeping

SCM / pages /3_Point estimates.py

Manoj

firt

6a04ca4 over 1 year ago

16 kB

	###### SUPER SAFE ######

	import pandas as pd
	import numpy as np
	import streamlit as st
	import pandas as pd
	import numpy as np
	import seaborn as sn
	import matplotlib.pyplot as plt
	from sklearn.linear_model import LogisticRegression
	from sklearn.preprocessing import MinMaxScaler, StandardScaler
	from sklearn.metrics import confusion_matrix, classification_report
	from sklearn.model_selection import train_test_split
	import xgboost as xgb
	from sklearn.linear_model import LinearRegression
	from sklearn.metrics import mean_squared_error, r2_score
	from sklearn.decomposition import PCA
	from sklearn.preprocessing import StandardScaler
	import numpy as np
	import plotly.figure_factory as ff


	st.set_page_config(
	layout="wide",
	)

	def point_estimates(df, model_type, flag, identifier, control_sample_size, solver=None, max_iter=None, class_weights=None, max_depth=None, subsample=None, eta=None):
	    """Fit a propensity model on PCA-reduced features and score every row of ``df``.

	    Steps:
	      1. Standardise every column except ``identifier`` and ``flag`` and
	         project onto the leading principal components whose cumulative
	         explained variance first exceeds 99.5%.
	      2. Train on all rows with ``flag == 1`` plus a random sample
	         (``random_state=42``) of ``control_sample_size`` rows with
	         ``flag == 0``.
	      3. Render the training-set classification report and confusion
	         matrix in Streamlit.
	      4. Predict the class-1 probability for every row of ``df``.

	    Args:
	        df: Input frame holding ``identifier``, ``flag`` and feature columns.
	        model_type: ``'linear'`` (LogisticRegression) or ``'xgboost'``
	            (XGBClassifier).
	        flag: Name of the 0/1 column separating control from treated rows.
	        identifier: Name of the row-identifier column (excluded from features).
	        control_sample_size: Number of ``flag == 0`` rows to sample for training.
	        solver, max_iter, class_weights: LogisticRegression settings; ``None``
	            for ``solver``/``max_iter`` falls back to the scikit-learn default.
	        max_depth, subsample, eta: XGBClassifier settings (``None`` = default).

	    Returns:
	        One-column DataFrame ``['propensity_score']`` indexed like ``df``.

	    Raises:
	        ValueError: If ``model_type`` is unknown or the training sample does
	            not contain both classes.
	    """
	    if model_type not in ('linear', 'xgboost'):
	        raise ValueError(f"Unsupported model_type {model_type!r}; expected 'linear' or 'xgboost'.")

	    # A score column left over from a previous run must not leak into the features.
	    excluded = [identifier, flag]
	    if 'propensity_score' in df.columns:
	        excluded.append('propensity_score')
	    features = df.drop(columns=excluded)

	    # PCA cannot return more components than min(n_samples, n_features).
	    scaled = StandardScaler().fit_transform(features)
	    pca = PCA(n_components=min(scaled.shape))
	    pca.fit(scaled)
	    components = pca.transform(scaled)

	    # Keep components up to and including the first one whose cumulative
	    # explained variance passes 99.5%; keep all if rounding never gets there.
	    cumulative_var = pca.explained_variance_ratio_.cumsum()
	    reached = cumulative_var > 0.995
	    last_comp = int(np.argmax(reached)) if reached.any() else len(cumulative_var) - 1

	    # Reuse df's index so the flag column and the returned scores stay
	    # row-aligned with the caller's frame even when its index is not 0..n-1.
	    df_pca = pd.DataFrame(components[:, :last_comp + 1], index=df.index)
	    df_pca[flag] = df[flag].to_numpy()
	    feature_cols = [col for col in df_pca.columns if col != flag]

	    # Training set: every treated row plus a reproducible control sample.
	    treated = df_pca[df_pca[flag] == 1]
	    control = df_pca[df_pca[flag] == 0]
	    if control_sample_size > len(control):
	        st.warning(f"Control sample size reduced to the {len(control)} available control rows.")
	        control_sample_size = len(control)
	    control_sample = control.sample(n=control_sample_size, random_state=42)
	    train_df = pd.concat([treated, control_sample], ignore_index=True)
	    X_train = train_df[feature_cols]
	    y_train = train_df[flag]  # 1-D target, as the estimators expect
	    if y_train.nunique() < 2:
	        raise ValueError("Training sample must contain both treated (1) and control (0) rows.")

	    if model_type == 'linear':
	        # Only forward settings that were supplied; scikit-learn rejects None here.
	        lr_kwargs = {name: value for name, value in (('solver', solver), ('max_iter', max_iter)) if value is not None}
	        model = LogisticRegression(class_weight=class_weights, **lr_kwargs)
	    else:
	        model = xgb.XGBClassifier(max_depth=max_depth, subsample=subsample, eta=eta)
	    model.fit(X_train, y_train)

	    # Training-set diagnostics: confusion matrix as a fraction of all training rows.
	    train_pred = model.predict(X_train)
	    classes = np.unique(y_train)
	    cm = confusion_matrix(y_train, train_pred, labels=classes) / len(y_train)

	    # Hover text describes the cell itself: row label = actual, column label = predicted.
	    hover_text = [
	        ['Actual: {}<br>Predicted: {}<br>Value: {:.2f}'.format(actual, predicted, cm[i, j])
	         for j, predicted in enumerate(classes)]
	        for i, actual in enumerate(classes)
	    ]

	    fig = ff.create_annotated_heatmap(
	        z=cm,
	        x=list(classes),
	        y=list(classes),
	        colorscale='blues',
	        hoverinfo='text',
	        text=hover_text,
	    )
	    fig.update_layout(
	        title='Confusion Matrix',
	        xaxis_title='Predicted',
	        yaxis_title='Actual',
	        font=dict(size=14),
	    )

	    report = classification_report(y_train, train_pred, output_dict=True)
	    report_df = pd.DataFrame(report).transpose()

	    # Score every row of the input, not just the training sample.
	    # Column 1 of predict_proba is the probability of the second class (flag == 1).
	    probabilities = model.predict_proba(df_pca[feature_cols])
	    df_pca.insert(0, 'propensity_score', probabilities[:, 1])

	    st.subheader("Classification Report")
	    st.dataframe(report_df, width=600)

	    st.subheader("Confusion matrix")
	    st.plotly_chart(fig)
	    return df_pca[['propensity_score']]



	# if 'df' in st.session_state:
	# task_type = st.sidebar.selectbox("Task Type", ["classification", "regression"],key="task_type")
	# model_type = st.sidebar.selectbox("Model Type", ["linear", "xgboost"])
	# flag = st.sidebar.selectbox("Flag Column", [None] + list(st.session_state.df.columns))
	# identifier = st.sidebar.selectbox("Identifier Column", [None] + list(st.session_state.df.columns))
	# st.sidebar.write("Applicable only for Regression model type")
	# dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
	# st.session_state.flag=flag
	# st.session_state.identifier=identifier
	# # Sidebar for user inputs
	# if flag is not None:
	# with st.expander("Model Configuration", expanded=True):
	# unique_flag_values = st.session_state.df[flag].unique()
	# for value in unique_flag_values:
	# st.write(f"Y == {value}: {len(st.session_state.df[st.session_state.df[flag] == value])}")
	# control_sample_size = st.text_input("Control Sample Size")

	# try:
	# # Try converting to an integer
	# control_sample_size = int(control_sample_size)

	# # Check if control_sample_size is within the valid range
	# flag_0_size = len(st.session_state.df[st.session_state.df[flag] == 0])
	# if control_sample_size < 0 or control_sample_size > flag_0_size:
	# st.error(f"Control Sample Size must be between 0 and {flag_0_size}.")

	# except ValueError:
	# st.error("Please enter a valid integer for Control Sample Size.")


	# #st.write("Applicable only for Regression model type")
	# #if st.session_state.get("task_type","") == "regression":
	# #dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
	# point_estimate_variable = st.text_input("Variable of interest")
	# st.session_state.point_estimate_variable=point_estimate_variable

	# if st.button("Run Modeling"):
	# result_df = point_estimates(st.session_state.df, task_type, model_type, point_estimate_variable, control_sample_size, flag, identifier, dep_var)

	# st.session_state.modeling_df = result_df
	# st.session_state.treated_df=result_df[result_df['Y']==1]
	# st.session_state.non_treated_df=result_df[result_df['Y']==0]




	# ---------------------------------------------------------------------------
	# "Algorithms" page: choose an algorithm, optionally tune its hyper-parameters,
	# pick the control sample size and run the propensity model.
	# ---------------------------------------------------------------------------

	def _algorithm_toggles(label, checkbox_key, options_key):
	    """Render an algorithm checkbox next to its 'Change default options' checkbox.

	    Returns ``(selected, show_options)``. The algorithm checkbox starts ticked
	    when ``st.session_state.algorithm_option`` equals ``label``; the options
	    checkbox is disabled while the algorithm is not selected.
	    """
	    name_col, options_col = st.columns(2)
	    with name_col:
	        st.write("#####")
	        selected = st.checkbox(
	            label=label,
	            key=checkbox_key,
	            value=(st.session_state.algorithm_option == label),
	        )
	    with options_col:
	        st.write("#####")
	        show_options = st.checkbox(
	            label="Change default options",
	            key=options_key,
	            disabled=not selected,
	        )
	    return selected, show_options


	def _lr_parameter_inputs(placeholder):
	    """Render the logistic-regression settings; return (solver, max_iter, class_weights)."""
	    with placeholder:
	        with st.expander("LR parameters"):
	            # index=1 pre-selects 'lbfgs', the same solver used when the options
	            # stay hidden, so merely opening the panel does not change the model.
	            solver = st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'], index=1)
	            max_iter = st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
	            class_weight_option = st.selectbox(
	                'Select class weights option:',
	                ('Custom', 'Balanced')
	            )
	            if class_weight_option == 'Custom':
	                weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
	                weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
	                class_weights = {1: weight_1, 0: weight_0}
	            else:
	                # scikit-learn's 'balanced' mode weights classes inversely to
	                # their frequency; a fixed {1: 0.5, 0: 0.5} is just equal weighting.
	                class_weights = 'balanced'
	    return solver, max_iter, class_weights


	def _xgb_parameter_inputs(placeholder):
	    """Render the XGBoost settings; return (max_depth, subsample, eta)."""
	    with placeholder:
	        with st.expander("XGB hyper parameters"):
	            max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
	            subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
	            eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)
	    return max_depth, subsample, eta


	st.title("Algorithms")

	if 'classification_option' not in st.session_state:
	    st.session_state.classification_option = "Classification"
	if 'algorithm_option' not in st.session_state:
	    st.session_state.algorithm_option = "Logistic Regression"

	# The radio owns the "classification_option" session key, so session state is
	# already in sync with the returned value; no manual write-back is needed.
	classification_option = st.radio("Algorithm Type", ["Classification", "Regression"], key="classification_option")

	# Both algorithm types share the same layout and widget keys; only labels differ.
	if classification_option == "Classification":
	    lr_label, xgb_label = "Logistic Regression", "Xgboost Classifier"
	else:
	    lr_label, xgb_label = "Linear Regression", "Xgboost Regression"

	# --- linear model row ---
	lr_checkbox, show_lr_options = _algorithm_toggles(lr_label, "algorithm_lr_cb", "lr_options_cb")

	cols = st.columns((2, 1))
	with cols[0]:
	    lr_hyp_placeholder = st.empty()
	    lr_model_placeholder = st.empty()

	# Defaults used when the options panel is hidden.
	solver = 'lbfgs'
	class_weights = None
	max_iter = 1000
	if show_lr_options and lr_checkbox:
	    solver, max_iter, class_weights = _lr_parameter_inputs(lr_hyp_placeholder)

	# --- xgboost row ---
	xgb_checkbox, show_xgb_options = _algorithm_toggles(xgb_label, "algorithm_xgb_cb", "xgb_options_cb")

	cols = st.columns((2, 1))
	with cols[0]:
	    xgb_hyp_placeholder = st.empty()

	# None lets XGBoost fall back to its own defaults.
	max_depth = None
	subsample = None
	eta = None
	if show_xgb_options and xgb_checkbox:
	    max_depth, subsample, eta = _xgb_parameter_inputs(xgb_hyp_placeholder)

	st.session_state.algorithm_option = lr_label if lr_checkbox else xgb_label

	# --- control sample size ---
	flag_values = st.session_state.imputed_df[st.session_state.flag]
	control_count = int((flag_values == 0).sum())
	treated_count = int((flag_values == 1).sum())
	if control_count < 1:
	    st.error("No control rows (flag == 0) are available to sample from.")
	    st.stop()

	with cols[0]:
	    control_sample_size = st.slider(
	        'Control Sample Size',
	        min_value=1,
	        max_value=control_count,
	        # Default to the treated count, clamped into the slider's range so an
	        # imbalanced dataset cannot raise an out-of-range exception.
	        value=min(max(treated_count, 1), control_count),
	    )

	# --- run ---
	if st.button("Run Modeling"):
	    binned_df = st.session_state.binned_df
	    flag_col = st.session_state.flag
	    shared_args = dict(
	        flag=flag_col,
	        identifier=st.session_state.identifier,
	        control_sample_size=control_sample_size,
	    )

	    # The linear model takes precedence when both checkboxes are ticked.
	    if lr_checkbox:
	        scores = point_estimates(binned_df, model_type='linear', solver=solver, max_iter=max_iter, class_weights=class_weights, **shared_args)
	    elif xgb_checkbox:
	        scores = point_estimates(binned_df, model_type='xgboost', max_depth=max_depth, subsample=subsample, eta=eta, **shared_args)
	    else:
	        scores = None
	        st.warning("Select an algorithm before running the model.")

	    if scores is not None:
	        # point_estimates returns one score per input row in input order, so
	        # assign by position rather than relying on index alignment.
	        binned_df['propensity_score'] = scores['propensity_score'].to_numpy()

	    # Split on the configured flag column (the same one the model was trained
	    # on) instead of a hardcoded 'Y'.
	    st.session_state.treated_df = binned_df[binned_df[flag_col] == 1]
	    st.session_state.non_treated_df = binned_df[binned_df[flag_col] == 0]