import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import statsmodels.api as sm
### ANALYZE USING TUKEY
# Load the three raw exports (heart rate, calories, steps), one CSV per metric,
# in that order.
hr_data, cal_data, steps_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\pktra\Desktop\sample_HR_data.csv",
        r"C:\Users\pktra\Desktop\sample_calories_data.csv",
        r"C:\Users\pktra\Desktop\sample_steps_data.csv",
    )
)
# Preview calls: as plain script statements their return values are discarded.
cal_data.head()
steps_data.head()
def _prepare_hourly(frame, start):
    """Return a cleaned copy of one raw metric frame, bucketed by hour.

    The result keeps only the 'creationdate' and 'value' columns, stores
    'value' as float, keeps only rows whose timestamp is strictly after
    ``start``, and finally replaces 'creationdate' with an hour-resolution
    string label ("%Y-%m-%d, %Hh") so that rows can be grouped per hour.

    Args:
        frame: DataFrame holding 'creationdate' and 'value' columns.
        start: datetime; rows at or before this instant are dropped.

    Returns:
        A new DataFrame; ``frame`` itself is not modified.
    """
    #filter to only columns we need
    prepared = pd.DataFrame(frame, columns=['creationdate', 'value'])
    #convert types (astype returns a new Series, so it has to be assigned back)
    prepared['value'] = prepared['value'].astype('float')
    prepared['creationdate'] = pd.to_datetime(prepared['creationdate'])
    # .copy() makes the filtered rows an independent frame, so the column
    # assignment below is not a chained assignment on a slice.
    prepared = prepared.loc[prepared['creationdate'] > start].copy()
    prepared['creationdate'] = prepared['creationdate'].dt.strftime("%Y-%m-%d, %Hh")
    return prepared


#reduce timeframe of analysis to 6/2021-12/2021 data only
split_date = datetime.datetime(2021, 6, 1)
hr_data = _prepare_hourly(hr_data, split_date)
cal_data = _prepare_hourly(cal_data, split_date)
steps_data = _prepare_hourly(steps_data, split_date)
# Bare names and .head() lines below are notebook-style previews; as script
# statements their values are simply discarded.

#keep only plausible HR readings: below max heart rate (220) and above the
#bradycardia cut-off (38)
hr_values = hr_data['value']
hr_data = hr_data[(hr_values < 220) & (hr_values > 38)]
#mean HR per creationdate label
hrmean = hr_data.groupby('creationdate').mean()
hrmean
#total energy expenditure per creationdate label
calsum = cal_data.groupby('creationdate').sum()
calsum
#drop nearly impossible totals (average person burns 45 calories per hour, at rest)
calsum = calsum[calsum['value'] > 30]
calsum
#total steps per creationdate label
stepssum = steps_data.groupby('creationdate').sum()
stepssum
#one row per time period: give each metric its final column name first, then
#inner-join the three frames on creationdate (only periods present in all three survive)
calhrsteps_merged = (
    hrmean.rename(columns={'value': 'HR'})
    .merge(calsum.rename(columns={'value': 'Cal'}), how='inner', on='creationdate')
    .merge(stepssum.rename(columns={'value': 'Steps'}), how='inner', on='creationdate')
)
calhrsteps_merged
#turn the creationdate index back into an ordinary column
calhrsteps_merged = calhrsteps_merged.reset_index()
calhrsteps_merged.head(30)
#creationdate back to datetime object for plotting
# Work on an explicit copy so the merged frame keeps its string labels.
calhrsteps_merged1 = calhrsteps_merged[['creationdate', 'HR']].copy()
# The labels were written with strftime("%Y-%m-%d, %Hh"); parsing them with the
# same explicit format avoids relying on pandas' format inference for this
# non-standard layout.
calhrsteps_merged1['creationdate'] = pd.to_datetime(
    calhrsteps_merged1['creationdate'], format="%Y-%m-%d, %Hh")
#plot the time series
plt.figure(figsize=(40, 10), dpi=80)
plt.xticks(rotation=90)
plt.plot('creationdate', 'HR', data=calhrsteps_merged1, marker='s',
         markerfacecolor='blue', linewidth=2, color='blue')
plt.legend()
plt.title('HR during 2021')
plt.xlabel('Date')
plt.ylabel('HR')
plt.show()
#regress HR on steps and calorie consumption
#10/1 is the cutoff: rows before it form the training set, rows from it onward the test set.
#creationdate holds "%Y-%m-%d, %Hh" strings at this point, so both comparisons
#below are plain string comparisons against a label in the same layout.
split_date = '2021-10-01, 00h'
before_cutoff = calhrsteps_merged['creationdate'] < split_date
from_cutoff = calhrsteps_merged['creationdate'] >= split_date
df_training = calhrsteps_merged[before_cutoff]
df_test = calhrsteps_merged.loc[from_cutoff]
print(df_test)
print(df_training.shape)
print(df_test.shape)

#sklearn model: HR explained by calories and steps
feature_cols = ['Cal', 'Steps']
x_train = df_training[feature_cols]
y_train = df_training['HR']
model = LinearRegression()
model.fit(x_train, y_train)
#fitted parameters and in-sample R^2
print('Intercept: ', model.intercept_)
print('Coefficients: ', model.coef_)
r_sq = model.score(x_train, y_train)
print(f"coefficient of determination: {r_sq}")

#same regression through statsmodels, for the full summary table;
#OLS needs the intercept column added explicitly
x_train = sm.add_constant(x_train)
result = sm.OLS(y_train, x_train).fit()
print(result.summary())

#predict the held-out rows with the sklearn model
x_test = df_test[feature_cols]
y_test = df_test['HR']
y_pred = model.predict(x_test)
#error metrics on the test rows
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {np.sqrt(mse)}")
#expected HR: apply the already-fitted model to every row (not just the test rows)
x = calhrsteps_merged.loc[:, ['Cal', 'Steps']]
y = calhrsteps_merged.loc[:, 'HR']
y_exp = model.predict(x)
calhrsteps_merged = calhrsteps_merged.assign(ExpHR=y_exp)
calhrsteps_merged
#histogram of observed HR, as a visual check for a normal distribution
calhrsteps_merged[["HR"]].plot(kind='hist')
#R^2 of the model over all rows
r_sq = model.score(x, y)
print(f"coefficient of determination: {r_sq}")
### TUKEY
# Row order is left untouched here: the charts drawn from this frame plot the
# rows in DataFrame order.
#fourth spreads
# NOTE(review): the lower/upper fourths are hard-coded. They look like values
# copied from a summary-statistics table of one particular run -- TODO confirm
# which column they describe and recompute them whenever the input data changes.
calhrsteps_merged['lower 4th'] = 70.445414
calhrsteps_merged['upper 4th'] = 82.726656
calhrsteps_merged['Fourth_spreads'] = calhrsteps_merged['upper 4th'] - calhrsteps_merged['lower 4th']
#calculate ucl and lcl
# Tukey limits sit 1.5 fourth-spreads outside the fourths: the upper limit is
# measured from the upper fourth, the lower limit from the LOWER fourth.
calhrsteps_merged['UCL'] = calhrsteps_merged['upper 4th'] + 1.5*calhrsteps_merged['Fourth_spreads']
calhrsteps_merged['LCL'] = calhrsteps_merged['lower 4th'] - 1.5*calhrsteps_merged['Fourth_spreads']
#Tukey chart over all rows: observed HR against the upper/lower control limits
# Work on an explicit copy so the merged frame keeps its string labels.
calhrsteps_merged2 = calhrsteps_merged[['creationdate', 'HR', 'UCL', 'LCL']].copy()
# Labels were written with strftime("%Y-%m-%d, %Hh"); parse them back with the
# same explicit format instead of relying on format inference.
calhrsteps_merged2['creationdate'] = pd.to_datetime(
    calhrsteps_merged2['creationdate'], format="%Y-%m-%d, %Hh")
plt.figure(figsize=(20, 10), dpi=80)
plt.xticks(rotation=90)
plt.plot('creationdate', 'HR', data=calhrsteps_merged2, marker='s', color='blue', linewidth=2)
plt.plot('creationdate', 'UCL', data=calhrsteps_merged2, color='red', linewidth=2)
plt.plot('creationdate', 'LCL', data=calhrsteps_merged2, color='red', linewidth=2)
plt.legend()
plt.title('Tukey - All Data')
plt.xlabel('Date')
plt.ylabel('Heart Rate (beats per min)')
plt.show()
#plot 2 days worth of data only
# Work on an explicit copy so the merged frame keeps its string labels.
calhrsteps_merged3 = calhrsteps_merged[['creationdate', 'HR', 'UCL', 'LCL']].copy()
# Labels were written with strftime("%Y-%m-%d, %Hh"); parse them back with the
# same explicit format instead of relying on format inference.
calhrsteps_merged3['creationdate'] = pd.to_datetime(
    calhrsteps_merged3['creationdate'], format="%Y-%m-%d, %Hh")
# Half-open window [11/18 00:00, 11/20 00:00): each timestamp is the start of
# an hour bucket, so ">=" is needed to keep the 00h bucket of 11/18, and the
# upper bound limits the chart to the two days named in the title.
split_date = datetime.datetime(2021, 11, 18)
window_end = datetime.datetime(2021, 11, 20)
in_window = (
    (calhrsteps_merged3['creationdate'] >= split_date)
    & (calhrsteps_merged3['creationdate'] < window_end)
)
calhrsteps_merged3 = calhrsteps_merged3.loc[in_window]
plt.figure(figsize=(20, 10), dpi=80)
plt.xticks(rotation=90)
plt.plot('creationdate', 'HR', data=calhrsteps_merged3, marker='s', color='blue', linewidth=2)
plt.plot('creationdate', 'UCL', data=calhrsteps_merged3, color='red', linewidth=2)
plt.plot('creationdate', 'LCL', data=calhrsteps_merged3, color='red', linewidth=2)
plt.legend()
plt.title('Tukey - HR from 11/18/21-11/19/21')
plt.xlabel('Date')
plt.ylabel('Heart Rate (beats per min)')
plt.show()