import pandas as pd


# Load the length-of-stay dataset from a local CSV file.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running anywhere else.
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\ayesh\Desktop\823\Expected LOS in 10 Diseases.csv",
)

# (rows, columns) of the loaded frame.
df.shape

(6657, 11)


# Keep only rows without missing values, then split them into the
# 'Cared for by Dr Smith' indicator (y) and the remaining columns (X).
# 'LOS' is left out of X as well; it is compared between groups later on.
X = df.dropna()
y = X['Cared for by Dr Smith']
X = X.drop(columns=['Cared for by Dr Smith', 'LOS'], errors='ignore')

(X.shape, y.shape)

((6657, 9), (6657,))


from sklearn.linear_model import LogisticRegression

# Logistic regression of the Dr. Smith indicator (y) on the columns of X.
# The iteration cap is raised to 10,000.
model = LogisticRegression(max_iter=10_000).fit(X, y)

# Intercept and coefficient vector of the fitted model.
(model.intercept_[0], model.coef_[0])

(0.877320460781235,
 array([ 0.00411923, -0.0091996 , -0.00089115,  0.01316132,  0.01316132,
        -0.0143746 , -0.08469029, -0.52473351, -0.52473351]))


# Predicted class probabilities for every row of X; column 1 is kept as the
# score used for matching.
class_probabilities = model.predict_proba(X)
y_pred = class_probabilities[:, 1]

y_pred.shape

(6657,)


# One row per observation: the true group label next to its predicted score.
Y = pd.DataFrame({'y_true': y, 'y_pred': y_pred})

# Scores split by group (1 = Dr. Smith, 0 = everyone else), each kept as a
# one-column DataFrame.
is_dr_smith = Y['y_true'].eq(1)
is_other = Y['y_true'].eq(0)
Y_1 = Y.loc[is_dr_smith, ['y_pred']]
Y_0 = Y.loc[is_other, ['y_pred']]

(Y.shape, Y_1.shape, Y_0.shape)

((6657, 2), (4007, 1), (2650, 1))


from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour index built on the scores of the Dr. Smith rows (Y_1).
nn = NearestNeighbors().fit(Y_1)
nn

NearestNeighbors()


def get_y0_iloc(i):
    """Return the index label of the row at position ``i`` of ``Y_0``.

    Despite the ``iloc`` in the name, the returned value comes from
    ``Y_0.index`` (a label), not a positional offset.

    Raises:
        IndexError: if ``i`` does not select any row of ``Y_0``.
    """
    label_window = Y_0.index[i:i + 1]
    return label_window[0]

def get_y1_iloc(i):
    """Return the index label of the row at position ``i`` of ``Y_1``.

    The result is a label taken from ``Y_1.index``, not a position, even
    though the function name says ``iloc``.

    Raises:
        IndexError: if ``i`` does not select any row of ``Y_1``.
    """
    single_label = Y_1.index[i:i + 1]
    return single_label[0]

def get_y1_ilocs(indices):
    """Map each position in ``indices`` to its ``Y_1`` index label.

    Returns a list in the same order as ``indices``; each element is the
    value ``get_y1_iloc`` yields for that position.
    """
    return list(map(get_y1_iloc, indices))
    
# Greedy nearest-neighbour matching on the predicted score: every row of Y_0
# (other physicians) is paired with one row of Y_1 (Dr. Smith).
# `neighbors[i]` holds positions into Y_1 of the candidates for Y_0 row i.
neighbors = nn.kneighbors(Y_0, return_distance=False)
seen = set()  # index labels of Dr. Smith rows already used in a pair
pairs = []
for i, neighs in enumerate(neighbors):
    y_0 = get_y0_iloc(i)
    candidates = get_y1_ilocs(neighs)
    # Take the first candidate that has not been matched yet; when every
    # candidate is already taken, fall back to the first one so that no
    # control row is left unpaired.
    y_1 = next((j for j in candidates if j not in seen), candidates[0])
    # Record the match; without this the filter above never excludes
    # anything and the same Dr. Smith row is paired over and over.
    seen.add(y_1)
    pairs.append((y_0, y_1))

# NOTE: pairs differ from runs in which `seen` was never filled, so any
# summaries computed from those earlier pairs need to be re-run.
pair_df = pd.DataFrame(pairs, columns=['y_0', 'y_1'])
pair_df.shape

(2650, 2)


# Column means of X over the matched rows from the other physicians.
# pair_df['y_0'] holds index labels (taken from Y_0.index), so rows are
# selected by label with .loc; positional .iloc would pick the wrong rows
# whenever dropna() removed rows and labels stopped matching positions.
X.loc[pair_df['y_0']].mean()

Hypertension             0.500000
Anemia                   0.499623
Diabetes                 0.499623
HIV                      0.498868
Stomach Cancer           0.498868
Lung Cancer              0.495094
Myocardial Infarction    0.492830
Heart Failure            0.483019
Metastetic Cancer        0.483019
dtype: float64


# Column means of X over the matched Dr. Smith rows.
# pair_df['y_1'] holds index labels (taken from Y_1.index), so rows are
# selected by label with .loc; positional .iloc would pick the wrong rows
# whenever dropna() removed rows and labels stopped matching positions.
X.loc[pair_df['y_1']].mean()

Hypertension             0.500000
Anemia                   0.499623
Diabetes                 0.499623
HIV                      0.503019
Stomach Cancer           0.494717
Lung Cancer              0.495094
Myocardial Infarction    0.492830
Heart Failure            0.486792
Metastetic Cancer        0.479245
dtype: float64


import matplotlib.pyplot as plt

plt.style.use('ggplot')

# Side-by-side bars of the per-column means of X for the two matched groups.
# pair_df stores index labels (not positions), so rows are looked up with
# .loc; .iloc is only right while labels and positions happen to coincide.
_ = pd.DataFrame({
    'other': X.loc[pair_df['y_0']].mean(),
    'dr_smith': X.loc[pair_df['y_1']].mean()
}).plot(kind='bar', figsize=(12, 4), title='Mean of Diagnoses, Dr. Smith vs Other')


# Mean LOS of the matched rows: (other physicians, Dr. Smith).
(
    df['LOS'].iloc[pair_df['y_0']].mean(),
    df['LOS'].iloc[pair_df['y_1']].mean(),
)

(3.960754716981132, 3.992830188679245)


# Mean LOS per matched group, in the order (other physicians, Dr. Smith).
tuple(df['LOS'].iloc[pair_df[group]].mean() for group in ('y_0', 'y_1'))

(3.960754716981132, 3.992830188679245)


# Difference in mean LOS between the matched groups: Dr. Smith minus others.
df['LOS'].iloc[pair_df['y_1']].mean() - df['LOS'].iloc[pair_df['y_0']].mean()

0.03207547169811331


# Attach the LOS of both members of every pair and the within-pair
# difference (Dr. Smith minus other).
# pair_df['y_0'] / pair_df['y_1'] hold df index labels, so each column is
# fetched with one label-based selection instead of building a full row per
# pair inside a lambda. .to_numpy() strips the df labels so the values are
# assigned by position rather than aligned on pair_df's own index.
pair_df['y_0_LOS'] = df.loc[pair_df['y_0'], 'LOS'].to_numpy()
pair_df['y_1_LOS'] = df.loc[pair_df['y_1'], 'LOS'].to_numpy()
pair_df['diff'] = pair_df['y_1_LOS'] - pair_df['y_0_LOS']

pair_df


# Average of the 'diff' column across all matched pairs.
pair_df.loc[:, 'diff'].mean()

0.03207547169811321

	y_0	y_1	y_0_LOS	y_1_LOS	diff
0	0	5406	3	4	1
1	1	5407	4	7	3
2	4	5408	3	2	-1
3	5	2055	6	6	0
4	8	1034	6	1	-5
...	...	...	...	...	...
2645	5289	6103	3	4	1
2646	5292	6653	6	2	-4
2647	5293	6646	5	2	-3
2648	5296	6228	5	1	-4
2649	5297	6656	3	4	1

Propensity Scoring

Question 1: The following data provide the length of stay (LOS) of patients seen by Dr. Smith (variable Dr Smith = 1) and by his peer group (variable Dr Smith = 0). Answer the following questions: