####################################################################### # Code Written by: Chelsea Zabowski # HAP 823 ; Spring 2020 ####################################################################### #### Set up environment ------------------------------------------------ # Find version of R installed on machine print(version) # R version 3.6.1 (2019-07-05) # Install RTools # https://cran.rstudio.com/bin/windows/Rtools/ # RTools35.exe #### Install + Load R Packages from CRAN #### options(scipen=999) # gets rid of the scientific notation # function to install packages if not yet installed on your machine fun.usePackage<-function(p){ # load a package if installed, else load after installation. # Args: # p: package name in quotes if (!is.element(p, installed.packages()[,1])){ print(paste("Package:",p,"Not found, Installing Now...")) install.packages(p, dep = TRUE)} print(paste("Loading Package :",p)) require(p, character.only = TRUE) } fun.usePackage('DataExplorer') # descriptive analysis fun.usePackage('tidyverse') # syntax transformation, data manipulation, graphing fun.usePackage('data.table') # fast i/o fun.usePackage('readxl') # read and write excel files fun.usePackage('gdata') # data manimpulation fun.usePackage('writexl') # read and write excel files fun.usePackage('openxlsx') # read and write excel files fun.usePackage('sqldf') # SQL wrapper for R fun.usePackage('fastDummies') # quick binerization # do some memory cleanup. gc(verbose = TRUE, reset = TRUE) format(memory.size(), units = "MB") rm(list = ls(all.names = TRUE)) ####################################################################### # Question 4: # Read the data, making sure all entries are numbers. Calculate age at each assessment not just at first assessment. # Clean the data, removing impossible situations (remove cases with date of assessment after death). # Remove irrelevant cases (all cases that have only one assessment) # For each assessment, remove all assessments that are more than 6 months older. # Organize age at current admission into a binary variable above or below the average age at current assessment. # Calculate a new variable for each assessment that checks if the person would have an eating in the next 6 months. This requires you to join the data for each person with itself (excluding all assessments prior or including current assessment). # Group the data based on current disabilities, gender, and age. Count the number of residents who died within 6 months of assessment for combination of disabilities, gender and age. To do this, first assess the number of days from first assessment for the death. Then examine if the assessment time is within 180 days of day of death. # Use ordinary regression to regress the logit of odds of dying on various current disabilities, age, gender, and pair wise interactions of these variables. # Identify what is the Markov Blanket of feeding disability in 6 months. ####################################################################### # Question 5: # Repeat question 4 but now predict 6 month likelihood of first occurence of walking disorders instead of death. # In this analysis, exclude all assessments that occur after walking disability has occurred. ####################################################################### # Import data Assessments <- read_table2("Week2_Logistic_Regression/Assessments.txt", col_names = FALSE) # Create column names Assessments_colnames<-(c("ID", "Age", "Gender", "Num_Assessment", "Days_Followed", "First_Assess", "Last_Assess", "Unable_Eat", "Unable_Transfer", "Unable_Groom", "Unable_Toilet", "Unable_Bathe", "Unable_Walk", "Unable_Dress", "Unable_Bowel", "Unable_Urine", "Survival", "Assessment_Num")) # Set column names colnames(Assessments) <- Assessments_colnames # 1) Remove observations where age is negative # 2) Remove irrelevant cases (all cases that have only one assessment) # 4) Calculate age at each assessment not just at first assessment. # 5) Binerize the gender Assessments_Clean <- Assessments %>% filter(Age > 0, # Remove people that have a negative age (There's 55 records that have neg age) Num_Assessment > 1) %>% mutate(Age_At_Assessment = Age + (First_Assess/365)) %>% mutate(Male = if_else(Gender == "M", 1,0)) %>% select(-Gender, -Survival) # 6) Calculate 6 months from their appointment day FirstAssess_6mOut <- Assessments_Clean %>% mutate(FirstAsses_6mOut = First_Assess + 180) # Calculate appt day + 6 months # 7) Find the first time patient was unable to walk First_Unable_Walk <- FirstAssess_6mOut %>% filter(Unable_Walk == 1) %>% group_by(ID) %>% summarise(FirstUnableWalk = min(First_Assess)) # 167809 obs # 8) Create flag for patient if unable walk 6months from appointment day # 9) Exclude all assessments that occur after walking disability has occured (per hw question) Assessments_w_UnableWalk_6m_Flag <- FirstAssess_6mOut %>% left_join(., y = First_Unable_Walk, by = "ID") %>% mutate(Unable_Walk_6m_Flag = if_else(FirstAsses_6mOut < FirstUnableWalk,0,1)) %>% mutate(Unable_Walk_6m_Flag = if_else(is.na(Unable_Walk_6m_Flag),0,Unable_Walk_6m_Flag)) %>% # Nulls from people that were always able to walk filter(First_Assess < FirstUnableWalk) # Exclude assessments that occur after walking disability # 10) Create decade variable from Age at Assessment # 11) Distinct data set by unique combination of independent and dependent variables Assessments_w_UnableWalk_6m_Flag_Distinct <- Assessments_w_UnableWalk_6m_Flag %>% mutate(Decade = floor(Age_At_Assessment/10)*10) %>% select(c(ID, Decade, Male, Unable_Eat, Unable_Transfer, Unable_Groom, Unable_Toilet, Unable_Bathe, Unable_Walk, Unable_Dress, Unable_Bowel, Unable_Urine, Unable_Walk_6m_Flag)) %>% distinct() # 12) Group the data based on current disabilities, gender, and age.(Group by the independent variables) # 13) Count the number of patients who were unable to walk within 6 months following their assessment given their combination of diagnoses and demographics # 14) Calculate probability of demographic/diagnoses group being unable to walk after 6 months # 15) Drop demographic/diagnoses groups with less than 10 people # Group data by independent variables Prob <- Assessments_w_UnableWalk_6m_Flag_Distinct %>% group_by(Decade, Male, Unable_Eat, Unable_Transfer, Unable_Groom, Unable_Toilet, Unable_Bathe, Unable_Dress, Unable_Bowel, Unable_Urine) %>% summarise(Prob = sum(Unable_Walk_6m_Flag)/n_distinct(ID), n = n_distinct(ID)) %>% filter(n > 9) %>% arrange(Prob) # 16) Use ordinary regression to regress the logit of odds of walking diability in 6 months on # various current disabilities, age, gender, and pair wise interactions of these variables # Methodology from Alemi video "Logistic Regression using ordinary regression" # https://www.youtube.com/watch?v=yXwsQ-ZFLzQ&feature=youtu.be # Logit function Logit_For_Regression <- Prob %>% mutate(Logit = if_else(Prob == 0, # if Prob = 0 (log(1/n)), # then if_else(Prob == 1, # if else Prob = 1 (log(n/(n+1))), # then (log(Prob/(1-Prob)))))) # else Prob = anything besides 1 or 0 # 17) Binerize the decade column # 18) Remove variables which should not go into the regerssion equation (Old decade, Probability (Prob), Number of Patients (n)) # 19) Reordered columns so "Logit" is the last column Final_Data_For_Regression <- Logit_For_Regression %>% dummy_cols(select_columns = "Decade") %>% select(-c(`Decade`,`Prob`,`n`)) %>% select(Male, contains("Decade"), contains("Unable"), Logit) # 20) Run model against all or some of independent variables using the glm (Generalized Linear Model) function # Logit variable what you're trying to predict Sixmonth_UnableWalk_Model_1 <- glm(Logit ~ Unable_Eat + Unable_Bowel + Unable_Urine , data = Final_Data_For_Regression) summary(Sixmonth_UnableWalk_Model_1) # Model equation = 0.0001874 + (0.5411226 * Unable_Eat) + (0.2903398 * Unable_Bowel) + (-0.1239866 * Unable_Urine) # 21) Run regression using all independent variables # Logit variable what you're trying to predict # Model run on ALL Independent variables Sixmonth_UnableWalk_Model_2 <- glm(Logit ~ . , data = Final_Data_For_Regression) summary(Sixmonth_UnableWalk_Model_2) # View confidence interval confint(Sixmonth_UnableWalk_Model_2)