# import library
import pandas as pd
import numpy as np
import glob
import warnings
import math
from patsy import dmatrix, dmatrices
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# import data
warnings.filterwarnings("ignore")


def load_csv_files(pattern):
    """Read every CSV file matching *pattern* into a single DataFrame.

    Files are read in ascending filename order and stacked row-wise with a
    fresh 0..n-1 index. An empty DataFrame is returned when no file matches.
    """
    frames = [pd.read_csv(path, low_memory=False)
              for path in sorted(glob.glob(pattern))]
    if not frames:
        # pd.concat raises ValueError when given an empty list.
        return pd.DataFrame()
    # One concat replaces the per-file DataFrame.append calls: append was
    # removed in pandas 2.0 and re-copied the accumulated frame on every pass.
    return pd.concat(frames, ignore_index=True, sort=False)


data = load_csv_files('data/original*.csv')
# Number of rows and columns
data.shape
# Recorded notebook output: (603, 84)
# Select columns: demographics, symptoms, history, and test results.
# Only rows whose TestResult is one of the two definite answers are kept.
tested_answers = ['Yes, and I tested negative', 'Yes, and I tested positive']
selected_columns = [
    'TestResult',
    'symptomsfirst', 'symptoms', 'symptomsresp', 'symptomsgastro', 'symptomsneuro',
    'symptomsinflamm',
    'age',
    'Ethnicity',
    'Race', 'gender',
    'symptomsconsresp', 'symptomsconsgastro', 'symptomsconsneuro',
    'symptomsconsinflamm',
]
# A single .loc plus an explicit copy gives an independent frame, so the
# column assignments that follow do not write to a slice of the original.
data = data.loc[data['TestResult'].isin(tested_answers), selected_columns].copy()
# convert the test result into numerical values (1: positive, 0: negative)
data['TestPositive'] = data['TestResult'].map({'Yes, and I tested positive': 1, 'Yes, and I tested negative': 0})
# Number of rows and columns
data.shape
# Recorded notebook output: (461, 16)
# convert history columns to numerical values (1: yes, 0: no or not sure)
# A missing answer (NaN) is coded as 0 as well.
history_codes = {'Yes': 1, 'No': 0, 'Not sure' : 0, np.nan: 0}
# new flag column -> survey column it is derived from
history_sources = {
    'IgnoreRespSymp': 'symptomsconsresp',
    'IgnoreGastroSymp': 'symptomsconsgastro',
    'IgnoreNeuroSymp': 'symptomsconsneuro',
    'IgnoreInflamSymp': 'symptomsconsinflamm',
}
for flag_column, answer_column in history_sources.items():
    data[flag_column] = data[answer_column].map(history_codes)
# combine all symptom columns into one comma-separated string per row
symptom_text = data[['symptoms', 'symptomsresp', 'symptomsgastro', 'symptomsneuro',
                     'symptomsinflamm']].astype(str)
data['symptoms_concat'] = symptom_text.apply(lambda row: ','.join(row), axis=1)
# every remaining missing value, in any column, becomes the string 'NaN'
data = data.fillna('NaN')
# select columns: outcome, symptom text, demographics, and history flags
outcome_and_symptom_columns = ['TestPositive', 'symptomsfirst', 'symptoms_concat']
demographic_columns = ['age', 'Ethnicity', 'Race', 'gender']
history_flag_columns = ['IgnoreRespSymp', 'IgnoreGastroSymp', 'IgnoreNeuroSymp', 'IgnoreInflamSymp']
data = data[outcome_and_symptom_columns + demographic_columns + history_flag_columns]
# Identify the list of symptoms from the "symptoms_concat" column
# Tokens that are not symptoms: the two "None" answers, the 'nan' text that
# astype(str) produced for missing cells, and empty tokens.
non_symptoms = {'', 'None', 'None or I did not get tested', 'nan'}
all_answers = ','.join(data['symptoms_concat'].values.tolist()).split(',')
# Strip the parenthesised suffix BEFORE de-duplicating, so variants of the same
# symptom collapse into one entry. Sorting makes the resulting column order
# identical on every run (plain set order depends on string hashing).
# NOTE: the order therefore differs from the one in the recorded notebook output.
symptoms = sorted({answer.split(' (')[0] for answer in all_answers} - non_symptoms)
len(symptoms)
# Recorded notebook output: 29
# binarizing symptoms: one 0/1 column per symptom, named by the lower-cased
# symptom text; 1 when the text occurs anywhere in the row's symptoms_concat
for s in symptoms:
    # np.where is called directly because the pd.np alias was removed in pandas 2.0.
    # regex=False matches the symptom text literally instead of as a pattern.
    data[s.lower()] = np.where(data.symptoms_concat.str.contains(s, regex=False), 1, 0)
data.columns
# Recorded notebook output:
# Index(['TestPositive', 'symptomsfirst', 'symptoms_concat', 'age', 'Ethnicity',
#        'Race', 'gender', 'IgnoreRespSymp', 'IgnoreGastroSymp', 'IgnoreNeuroSymp',
#        'IgnoreInflamSymp', 'sore throat', 'cough', 'loss of smell',
#        'excessive sweating', 'joint or any other unexplained pain',
#        'nausea or vomiting', 'change in or loss of appetite',
#        'red or purple rash or lesions on your toes', 'slurred speech',
#        'loss of taste', 'unexplained rashes anywhere else',
#        'fever or feeling feverish', 'fatigue', 'shortness of breath',
#        'loss of balance', 'headaches', 'new confusion',
#        'pinkeye or conjunctivitis', 'wheezing', 'runny nose', 'chills',
#        'any tingling/numbness/swelling in hands or feet',
#        'unusual shivering or shaking', 'diarrhea', 'stomach or abdominal pain',
#        'bluish lips or face', 'chest pain', 'muscle aches',
#        'difficulty breathing'], dtype='object')
data.shape
# Recorded notebook output: (461, 40)
# Convert age into 2 categories
# The earlier fillna('NaN') leaves the string 'NaN' in place of a missing age,
# which would make the numeric comparisons below raise TypeError. Coercing to
# numeric turns such entries back into NaN, so they get 0 in both categories.
age = pd.to_numeric(data['age'], errors='coerce')
data['Age 30 and over'] = np.where(age >= 30, 1, 0)
# No lower bound is applied: every age below 30 is flagged here.
data['Age 18 to 29'] = np.where(age < 30, 1, 0)
data = data.drop(columns=['age'])
# Frequency of each raw Race answer (notebook display expression)
data['Race'].value_counts()
# Recorded notebook output:
# White                                                                   364
# Black or African American                                                59
# Asian                                                                    21
# American Indian or Alaska Native                                          5
# Native Hawaiian or Other Pacific Islander                                 4
# Other, please specify:                                                    3
# White,Black or African American                                           2
# White,Black or African American,American Indian or Alaska Native          1
# White,Other, please specify:                                              1
# White,Asian,Native Hawaiian or Other Pacific Islander                     1
# Name: Race, dtype: int64
# Get dummies for race, ethnicity, and gender
# dtype is pinned to uint8 (the default before pandas 2.0) so the indicator
# columns stay 0/1; pandas >= 2.0 would otherwise produce True/False.
data = pd.get_dummies(data, columns=['Race', 'Ethnicity', 'gender'], dtype=np.uint8)
# Remove the indicator columns for the "other"/"unknown" answers.
data = data.drop(columns=['Race_Other, please specify:', 'Ethnicity_Unknown', 'gender_Other',
                          'Race_White,Other, please specify:'])
# select list of columns: everything except the outcome and the raw symptom text
excluded_columns = ('symptoms_concat', 'symptomsfirst', 'TestPositive')
cols = [name for name in data.columns if name not in excluded_columns]
cols
# Recorded notebook output:
# ['IgnoreRespSymp', 'IgnoreGastroSymp', 'IgnoreNeuroSymp', 'IgnoreInflamSymp',
#  'sore throat', 'cough', 'loss of smell', 'excessive sweating',
#  'joint or any other unexplained pain', 'nausea or vomiting',
#  'change in or loss of appetite', 'red or purple rash or lesions on your toes',
#  'slurred speech', 'loss of taste', 'unexplained rashes anywhere else',
#  'fever or feeling feverish', 'fatigue', 'shortness of breath',
#  'loss of balance', 'headaches', 'new confusion', 'pinkeye or conjunctivitis',
#  'wheezing', 'runny nose', 'chills',
#  'any tingling/numbness/swelling in hands or feet',
#  'unusual shivering or shaking', 'diarrhea', 'stomach or abdominal pain',
#  'bluish lips or face', 'chest pain', 'muscle aches', 'difficulty breathing',
#  'Age 30 and over', 'Age 18 to 29', 'Race_American Indian or Alaska Native',
#  'Race_Asian', 'Race_Black or African American',
#  'Race_Native Hawaiian or Other Pacific Islander', 'Race_White',
#  'Race_White,Asian,Native Hawaiian or Other Pacific Islander',
#  'Race_White,Black or African American',
#  'Race_White,Black or African American,American Indian or Alaska Native',
#  'Ethnicity_Hispanic Latino', 'Ethnicity_Non-Hispanic Latino',
#  'gender_Female', 'gender_Male']
len(cols)
# Recorded notebook output: 47
# Lower-case every symptom name, updating the existing list object in place.
symptoms[:] = [name.lower() for name in symptoms]
# Remove the two raw symptom text columns from the frame.
data = data.drop(['symptomsfirst', 'symptoms_concat'], axis=1)
# Shorten and tidy the column names.
# Each (old, new) pair is a literal substring replacement applied to every
# column name, in this order. The order matters: "Race_White" must become
# "Race White" before the three "Race White,..." mixed-race patterns can match.
# Plain str.replace is used so matching is always literal, independent of the
# regex default of Index.str.replace, which differs between pandas versions.
column_replacements = [
    ("gender_Female", "Female"),
    ("gender_Male", "Male"),
    ("Ethnicity_Hispanic Latino", "Hispanic or Latino"),
    ("Ethnicity_Non-Hispanic Latino", "Non Hispanic or Latino"),
    ("Race_White", "Race White"),
    ("Race_Black or African American", "Race Black or African American"),
    ("Race_Native Hawaiian or Other Pacific Islander", "Race Native Hawaiian or Other Pacific Islander"),
    ("Race_American Indian or Alaska Native", "Race American Indian or Alaska Native"),
    ("Race_Asian", "Race Asian"),
    ("Race White,Black or African American,American Indian or Alaska Native", "Mixed Race 1"),
    ("Race White,Asian,Native Hawaiian or Other Pacific Islander", "Mixed Race 2"),
    ("Race White,Black or African American", "Mixed Race 3"),
    ("stomach or abdominal pain", "Abdominal pain"),
    ("chest pain", "Chest pain"),
    ("chills", "Chills"),
    ("new confusion", "Confusion"),
    ("cough", "Cough"),
    ("diarrhea", "Diarrhea"),
    ("excessive sweating", "Excessive sweating"),
    ("fatigue", "Fatigue"),
    ("fever or feeling feverish", "Fever"),
    ("headaches", "Headaches"),
    ("joint or any other unexplained pain", "Joint pain"),
    ("change in or loss of appetite", "Loss of appetite"),
    ("loss of balance", "Loss of balance"),
    ("loss of smell", "Loss of smell"),
    ("loss of taste", "Loss of taste"),
    ("muscle aches", "Muscle aches"),
    ("pinkeye or conjunctivitis", "Pinkeye"),
    ("red or purple rash or lesions on your toes", "Red rash"),
    ("runny nose", "Runny nose"),
    ("unusual shivering or shaking", "Shivering"),
    ("shortness of breath", "Shortness of breath"),
    ("difficulty breathing", "Difficulty breathing"),
    ("sore throat", "Sore throat"),
    ("unexplained rashes anywhere else", "Unexplained rash"),
    ("nausea or vomiting", "Vomiting"),
    ("wheezing", "Wheezing"),
    ("bluish lips or face", "Bluish lips or face"),
    ("any tingling/numbness/swelling in hands or feet", "Numbness"),
    ("slurred speech", "Slurred speech"),
    ("IgnoreNeuroSymp", "History of neurological symptoms"),
    ("IgnoreGastroSymp", "History of gastrointestinal symptoms"),
    ("IgnoreRespSymp", "History of respiratory symptoms"),
    ("IgnoreInflamSymp", "History of inflammatory symptoms"),
]
renamed_columns = []
for column in data.columns:
    for old, new in column_replacements:
        column = column.replace(old, new)
    # Finally, spaces become underscores in every name.
    renamed_columns.append(column.replace(' ', '_'))
data.columns = renamed_columns
data.columns.tolist()
# Recorded notebook output:
# ['TestPositive', 'History_of_respiratory_symptoms',
#  'History_of_gastrointestinal_symptoms', 'History_of_neurological_symptoms',
#  'History_of_inflammatory_symptoms', 'Sore_throat', 'Cough', 'Loss_of_smell',
#  'Excessive_sweating', 'Joint_pain', 'Vomiting', 'Loss_of_appetite',
#  'Red_rash', 'Slurred_speech', 'Loss_of_taste', 'Unexplained_rash', 'Fever',
#  'Fatigue', 'Shortness_of_breath', 'Loss_of_balance', 'Headaches',
#  'Confusion', 'Pinkeye', 'Wheezing', 'Runny_nose', 'Chills', 'Numbness',
#  'Shivering', 'Diarrhea', 'Abdominal_pain', 'Bluish_lips_or_face',
#  'Chest_pain', 'Muscle_aches', 'Difficulty_breathing', 'Age_30_and_over',
#  'Age_18_to_29', 'Race_American_Indian_or_Alaska_Native', 'Race_Asian',
#  'Race_Black_or_African_American',
#  'Race_Native_Hawaiian_or_Other_Pacific_Islander', 'Race_White',
#  'Mixed_Race_2', 'Mixed_Race_3', 'Mixed_Race_1', 'Hispanic_or_Latino',
#  'Non_Hispanic_or_Latino', 'Female', 'Male']
data.shape
# Recorded notebook output: (461, 48)
# Renumber the rows 0..n-1 (the previous index is discarded, not kept as a
# column) and write the preprocessed table without the index.
data = data.reset_index(drop=True)
data.to_csv('data/preprocessed.csv', index=False)
# read the list of row ids for each data split (30 lists of ids for training and 30 lists of ids for testing)
ids = pd.read_csv("data/30_splits_ids.csv")
# save the 30 training and testing data: one output file per column of ids,
# named after that column
for i in ids.columns:
    # NaN entries are skipped (presumably padding for the shorter id lists);
    # a column containing NaN is read as float, so the ids are cast back to
    # int before being used as row labels.
    index = ids[i].dropna().astype(int).tolist()
    d = data.loc[index]
    d.to_csv('data/30_splits_data/' + str(i), index=False)