Source code for the prediction of COVID-19 test results. This is supplemental material to the publication
Wojtusiak J, Bagais W, Vang J, Guralnik E, Roess A, Alemi F, "The Role of Symptom Clusters in Triage of COVID-19 Patients," Quality Management in Health Care, 2022.
Source code by Wejdan Bagais and Jee Vang, with contributions from other authors.
# import libraries
from models import select_attributes
from models import conjunction_columns
import pandas as pd
import numpy as np
import timeit
from patsy import dmatrix, dmatrices
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
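# conjunction_columns and select_attributes are project helpers defined in models.py.
# A minimal sketch of the assumed behavior of conjunction_columns (illustrative only,
# not the actual implementation): expand the binary symptom columns into single,
# pairwise, and triple conjunction terms with a patsy formula, mirroring the explicit
# construction used further below for the full data set.
def conjunction_columns_sketch(train, test, target='TestPositive', degree=3):
    symptoms = [c for c in train.columns if c != target]
    formula = f"{target} ~ ({' + '.join(symptoms)}) ** {degree}"
    yT, XT = dmatrices(formula, train, return_type='dataframe')
    yt, Xt = dmatrices(formula, test, return_type='dataframe')
    return XT.drop(columns=['Intercept']), Xt.drop(columns=['Intercept']), yT[target], yt[target]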
start = timeit.default_timer()
# list of inverse regularization strengths (C) to sweep
c_list = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.5,2]
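# accumulators for per-split, per-C results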
split_ids = []
cs = []
source = []
AUCs = []
prec = []
rec = []
vars_cnt = []
vars_lists = []
ys_test = []
ys_pred = []
# loop over the 30 data splits
for i in range(0, 30):
    # read the data
    tr_path = "../data/30_splits_data/binary-transformed_tr_" + str(i) + ".csv"
    ts_path = "../data/30_splits_data/binary-transformed_ts_" + str(i) + ".csv"
    train = pd.read_csv(tr_path)
    test = pd.read_csv(ts_path)
    # create the single, pair, and triplet clusters
    XT, Xt, yT, yt = conjunction_columns(train, test)
    # loop over the inverse regularization strengths
    for c in c_list:
        # run the model
        auc, recall, precision, valid_cols, yt, y_pred = select_attributes(XT, yT, Xt, yt, c)
        # save results to the lists
        split_ids.append(i)
        cs.append(c)
        source.append('conjunction')
        AUCs.append(auc)
        prec.append(precision)
        rec.append(recall)
        vars_lists.append(valid_cols)
        vars_cnt.append(len(valid_cols))
        ys_test.append(yt.values.tolist())
        ys_pred.append(y_pred)
        print(f'ID {i}, C={c:.2f}, AUC={auc:.5f}, Precision={precision:.5f}, Recall={recall:.5f}, cls# {len(valid_cols)}')
stop = timeit.default_timer()
print('Time: ', stop - start)
# identify the list of unique selected predictors
unq_var = vars_lists.copy()
for i in range(0, len(unq_var)):
    unq_var[i] = [sub.replace(' & ', ',') for sub in unq_var[i]]
sympt_lists = []
for i in range(0, len(unq_var)):
    l = ",".join(unq_var[i])
    l2 = list(set(l.split(',')))
    sympt_lists.append(l2)
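# Worked example (illustrative, not from the original script): a selected list such as
#   ['Cough & Fever', 'Cough & Loss_of_smell & Loss_of_taste']
# becomes, after replacing ' & ' with ',' joining, splitting, and de-duplicating,
# the unique symptom set {'Cough', 'Fever', 'Loss_of_smell', 'Loss_of_taste'}.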
ys_pred_l = []
for i in ys_pred:
    ys_pred_l.append(list(i))
# create dataframe for all results
ff = pd.DataFrame({'split_ids': split_ids,
                   'cs': cs,
                   'source': source,
                   'AUCs': AUCs,
                   'prec': prec,
                   'rec': rec,
                   'vars_cnt': vars_cnt,
                   'vars_lists': vars_lists,
                   'y_test': ys_test,
                   'y_pred': ys_pred_l
                   })
# add the list of unique predictors and their count
ff['sympt_lists'] = sympt_lists
ff['sympt_cnt'] = ff['sympt_lists'].apply(len)
# save the results
ff.to_csv('../data/results/interaction_terms.csv', index = False)
table = pd.pivot_table(ff, values=['AUCs', 'vars_cnt', 'sympt_cnt'],
                       index=['cs'],
                       aggfunc=np.mean).round(decimals=4)
table['sympt_cnt'] = table['sympt_cnt'].round().astype(int)
table['vars_cnt'] = table['vars_cnt'].round().astype(int)
table
cs  | AUCs   | sympt_cnt | vars_cnt
----|--------|-----------|---------
0.1 | 0.7800 | 8         | 5
0.2 | 0.7863 | 12        | 9
0.3 | 0.8040 | 15        | 14
0.4 | 0.8143 | 17        | 18
0.5 | 0.8147 | 20        | 25
0.6 | 0.8144 | 23        | 32
0.7 | 0.8138 | 25        | 38
0.8 | 0.8144 | 27        | 45
0.9 | 0.8166 | 28        | 52
1.0 | 0.8170 | 29        | 58
1.5 | 0.8115 | 33        | 94
2.0 | 0.8071 | 35        | 130
column = table["AUCs"]
best_c = column.idxmax()
best_c
1.0
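# Optional visualization (not part of the original script): plot the mean AUC against
# the mean number of selected variables for each C in `table`, to show the
# accuracy / model-size tradeoff discussed below.
fig, ax1 = plt.subplots(figsize=(8, 4))
ax1.plot(table.index, table['AUCs'], marker='o', color='b', label='mean AUC')
ax1.set_xlabel('C (inverse regularization strength)')
ax1.set_ylabel('mean AUC')
ax2 = ax1.twinx()
ax2.plot(table.index, table['vars_cnt'], marker='s', color='r', label='mean # selected variables')
ax2.set_ylabel('mean # selected variables')
fig.tight_layout()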
# C = 1.0 has the highest mean AUC;
# however, C = 0.4 achieves a very similar AUC
# with far fewer selected variables, so it is used below.
_C = 0.4
path = "../data/preprocessed.csv"
df = pd.read_csv(path)
X, Xt, y, yt = conjunction_columns(df, df)
symptoms = df.columns.tolist()
symptoms.remove('TestPositive')
symptom_interactions = ' + '.join(symptoms)
formula = f'TestPositive ~ ({symptom_interactions}) ** 3'
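# For illustration (hypothetical column names, not from the original data): with three
# symptom columns the same pattern
#     'TestPositive ~ (Cough + Fever + Chills) ** 3'
# expands to the main effects Cough, Fever, Chills, all pairwise interactions
# (e.g. Cough:Fever), and the triple Cough:Fever:Chills, plus an intercept column.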
include_columns = symptoms + ['TestPositive']
y, X = dmatrices(formula, df[include_columns], return_type='dataframe')
X = X.drop(columns=['Intercept'])
X.columns = [s.replace('_',' ') for s in X.columns]
X.columns = X.columns.str.replace(":", " & ")
Xy_pickle = 'BinaryDataX_pairs_triples.p'
pickle.dump({'X': X, 'y': y}, open(Xy_pickle, 'wb'))
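# (Added note, an assumption about intent): the design matrix and labels are pickled so
# that each parallel worker below can reload them from disk rather than receiving them
# through joblib's argument serialization for every fold.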
def do_validation(fold, tr, te, c=_C):
    # reload the design matrix and labels saved above
    data = pickle.load(open(Xy_pickle, 'rb'))
    X, y = data['X'], data['y']
    X_tr, X_te = X.iloc[tr], X.iloc[te]
    y_tr, y_te = y.iloc[tr].values.ravel(), y.iloc[te].values.ravel()
    print(f'fold {fold:02}')
    # L1-penalized logistic regression fit on the training fold
    regressor = LogisticRegression(penalty='l1', solver='saga', C=c, n_jobs=-1, max_iter=5000*2)
    regressor.fit(X_tr, y_tr)
    y_pr = regressor.predict_proba(X_te)[:, 1]
    score = roc_auc_score(y_te, y_pr)
    print(f'fold {fold:02}, score={score:.5f}')
    return score, regressor.coef_[0]
skf = StratifiedKFold(n_splits=24, shuffle=True, random_state=37)
outputs = Parallel(n_jobs=-1)(delayed(do_validation)(fold, tr, te, _C)
for fold, (tr, te) in enumerate(skf.split(X, y)))
scores = pd.Series([score for score, _ in outputs])
coefs = pd.DataFrame([coef for _, coef in outputs], columns=X.columns)
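# (Added) Quick summary of the cross-validated AUC; not in the original script.
print(f'CV AUC: {scores.mean():.5f} +/- {scores.std():.5f} over {len(scores)} folds')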
def get_profile(df, col):
    # summarize how often a predictor's coefficient is positive or negative
    # across the cross-validation folds
    s = df[col]
    s_pos = s[s > 0]
    s_neg = s[s < 0]
    n = df.shape[0]
    p_pos = len(s_pos) / n
    p_neg = len(s_neg) / n
    return {
        'field': col,
        'n_pos': len(s_pos),
        'n_neg': len(s_neg),
        'pct_pos': p_pos,
        'pct_neg': p_neg,
        # keep a predictor only if its coefficient has the same sign
        # in at least 95% of the folds
        'is_valid': 1 if p_pos >= 0.95 or p_neg >= 0.95 else 0
    }
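# Illustrative usage (toy data, not from the original script): with 4 folds, a term whose
# coefficient is positive in every fold is kept, while a term whose sign flips is dropped.
_demo_coefs = pd.DataFrame({'Cough & Fever': [0.4, 0.3, 0.5, 0.2],
                            'Chills': [0.1, -0.2, 0.05, -0.1]})
# get_profile(_demo_coefs, 'Cough & Fever')['is_valid']  -> 1
# get_profile(_demo_coefs, 'Chills')['is_valid']         -> 0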
valid_coefs = pd.DataFrame([get_profile(coefs, c) for c in coefs.columns]).sort_values(['is_valid'], ascending=False)
valid_coefs = valid_coefs[valid_coefs.is_valid == 1]
valid_cols = list(valid_coefs.field)
regressor = LogisticRegression(penalty='l1', solver='saga', C= _C, n_jobs=-1,
max_iter=5000*2, random_state=37)
regressor.fit(X[valid_cols], y.values.ravel())
LogisticRegression(C=0.4, max_iter=10000, n_jobs=-1, penalty='l1', random_state=37, solver='saga')
y_pred = regressor.predict_proba(X[valid_cols])[:,1]
t = X[valid_cols].copy()
t['y_pred'] = y_pred
t['y_actual'] = y
t.to_csv("../data/results/prediction_interaction_terms.csv", index=False)
c = pd.Series(regressor.coef_[0], valid_cols)
plt.style.use('ggplot')
i = pd.Series([regressor.intercept_[0]], index=['intercept'])
s = pd.concat([c[c > 0], c[c < 0]]).sort_index()
s = pd.concat([i, s])
color = ['r' if v > 0 else 'b' for v in s]
ax = s.plot(kind='bar', color=color, figsize=(20, 4))
_ = ax.set_title(f'Logistic Regression, validated auc={scores.mean():.5f}')
s_odds = np.exp(s)
color = ['r' if v > 1 else 'b' for v in s_odds]
ax = s_odds.plot(kind='bar', color=color, figsize=(20, 4))
_ = ax.set_title(f'Logistic Regression, coefficient odds')
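# Note (added): exponentiating a logistic regression coefficient gives an odds ratio,
# e.g. a coefficient of 0.5409 corresponds to exp(0.5409) ~= 1.72, i.e. roughly 72%
# higher odds of a positive test when that conjunction of symptoms is present.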
pd.DataFrame({
'coefficient': s,
'coefficient_odds': s_odds
}).round(decimals=4)
                                                                 | coefficient | coefficient_odds
-----------------------------------------------------------------|-------------|-----------------
intercept                                                        | -1.7669     | 0.1709
Cough & Age_18_to_29 & Female                                    | 0.3608      | 1.4345
Cough & Fatigue & Non_Hispanic_or_Latino                         | -0.2603     | 0.7708
Cough & Fever & Runny_nose                                       | 0.5409      | 1.7175
Cough & Headaches & Race_White                                   | 0.6375      | 1.8917
Cough & Headaches & Runny_nose                                   | 0.5911      | 1.8059
Cough & Loss_of_appetite & Race_White                            | 0.4745      | 1.6073
Cough & Loss_of_smell & Loss_of_taste                            | 0.9050      | 2.4718
Cough & Loss_of_taste & Fever                                    | 0.3199      | 1.3769
Cough & Loss_of_taste & Runny_nose                               | 0.0550      | 1.0566
Fatigue & Chest_pain & Race_White                                | 1.5554      | 4.7370
Fever & Age_30_and_over                                          | -0.4427     | 0.6423
Headaches & Chills & Race_White                                  | 0.6258      | 1.8698
Headaches & Muscle_aches                                         | 0.6852      | 1.9841
Headaches & Pinkeye & Non_Hispanic_or_Latino                     | 0.7138      | 2.0418
History_of_respiratory_symptoms & Cough & Muscle_aches           | -0.6269     | 0.5342
History_of_respiratory_symptoms & Cough & Race_White             | 0.5975      | 1.8175
History_of_respiratory_symptoms & Cough & Runny_nose             | 0.1993      | 1.2206
History_of_respiratory_symptoms & Muscle_aches & Age_30_and_over | -0.8643     | 0.4213
Loss_of_taste & Headaches & Non_Hispanic_or_Latino               | 0.3909      | 1.4783
Numbness & Non_Hispanic_or_Latino                                | -0.5733     | 0.5637
Runny_nose & Non_Hispanic_or_Latino                              | -0.9138     | 0.4010
Sore_throat & Chills & Female                                    | -0.2131     | 0.8081
Sore_throat & Fever & Runny_nose                                 | 0.4726      | 1.6041
(pd.DataFrame({
'coefficient': s,
'coefficient_odds': s_odds
}).round(decimals=4)).shape
(24, 2)