In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import statsmodels.api as sm
In [2]:
### ANALYZE USING TUKEY
In [3]:
hr_data = pd.read_csv(r"C:\Users\pktra\Desktop\sample_HR_data.csv")
cal_data = pd.read_csv(r"C:\Users\pktra\Desktop\sample_calories_data.csv")
steps_data = pd.read_csv(r"C:\Users\pktra\Desktop\sample_steps_data.csv")
C:\Users\pktra\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3457: DtypeWarning: Columns (6) have mixed types.Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)
In [4]:
cal_data.head()
Out[4]:
unit creationdate startdate enddate value Unnamed: 5 Unnamed: 6
0 Cal 9/9/2018 10:54 9/9/2018 10:39 9/9/2018 10:54 18.258 NaN NaN
1 Cal 9/9/2018 11:09 9/9/2018 10:54 9/9/2018 11:08 17.214 NaN NaN
2 Cal 9/9/2018 11:24 9/9/2018 11:08 9/9/2018 11:24 18.665 NaN NaN
3 Cal 9/9/2018 11:39 9/9/2018 11:24 9/9/2018 11:39 18.673 NaN NaN
4 Cal 9/9/2018 11:54 9/9/2018 11:39 9/9/2018 11:53 17.429 NaN NaN
In [5]:
steps_data.head()
Out[5]:
unit creationdate startdate enddate value
0 count 9/9/2018 12:24 9/9/2018 12:13 9/9/2018 12:23 85.0
1 count 9/9/2018 12:30 9/9/2018 12:28 9/9/2018 12:28 10.0
2 count 9/9/2018 13:02 9/9/2018 12:46 9/9/2018 12:56 154.0
3 count 9/9/2018 13:09 9/9/2018 12:56 9/9/2018 13:04 371.0
4 count 9/9/2018 13:22 9/9/2018 13:04 9/9/2018 13:14 95.0
In [6]:
#filter to only columns we need 
hr_data = pd.DataFrame(hr_data, columns=['creationdate','value'])
cal_data = pd.DataFrame(cal_data, columns=['creationdate','value'])
steps_data = pd.DataFrame(steps_data, columns=['creationdate','value'])
In [7]:
#convert types
hr_data['value'].astype('float')
cal_data['value'].astype('float')
steps_data['value'].astype('float')
Out[7]:
0          85.0
1          10.0
2         154.0
3         371.0
4          95.0
          ...  
188331    235.0
188332    204.0
188333     10.0
188334     58.0
188335      8.0
Name: value, Length: 188336, dtype: float64
In [8]:
hr_data['creationdate']=pd.to_datetime(hr_data['creationdate'])
cal_data['creationdate']=pd.to_datetime(cal_data['creationdate'])
steps_data['creationdate']=pd.to_datetime(steps_data['creationdate'])

#reduce timeframe of analysis to 6/2021-12/2021 data only
split_date = datetime.datetime(2021, 6, 1)
hr_data = hr_data.loc[hr_data['creationdate'] > split_date]
cal_data = cal_data.loc[cal_data['creationdate'] > split_date]
steps_data = steps_data.loc[steps_data['creationdate'] > split_date]

hr_data['creationdate'] = hr_data['creationdate'].dt.strftime("%Y-%m-%d, %Hh")
cal_data['creationdate'] = cal_data['creationdate'].dt.strftime("%Y-%m-%d, %Hh")
steps_data['creationdate'] = steps_data['creationdate'].dt.strftime("%Y-%m-%d, %Hh")
In [9]:
hr_data.head()
Out[9]:
creationdate value
161716 2021-06-01, 00h 52.0
161717 2021-06-01, 00h 54.0
161718 2021-06-01, 00h 55.0
161719 2021-06-01, 00h 54.0
161720 2021-06-01, 00h 55.0
In [10]:
#remove nearly impossible HR values
hr_data=hr_data[hr_data.value < 220] #cannot exceed max heart rate
hr_data=hr_data[hr_data.value > 38] #cannot be bradycardic

#group hr data by the hour 
hrmean = hr_data.groupby('creationdate').mean()
hrmean
Out[10]:
value
creationdate
2021-06-01, 00h 54.619877
2021-06-01, 01h 53.785714
2021-06-01, 02h 52.027908
2021-06-01, 03h 53.428571
2021-06-01, 04h 52.408371
... ...
2021-11-19, 12h 71.692308
2021-11-19, 13h 74.600000
2021-11-19, 14h 77.500000
2021-11-19, 15h 73.545455
2021-11-19, 16h 70.571429

2217 rows × 1 columns

In [11]:
#group calorie data by the hour using sum energy expenditure
calsum = cal_data.groupby('creationdate').sum()
calsum

#remove nearly impossible calorie expenditure (average person burns 45 calories per hour, at rest)
calsum=calsum[calsum.value > 30]
calsum
Out[11]:
value
creationdate
2021-06-01, 00h 76.537
2021-06-01, 01h 61.654
2021-06-01, 02h 79.292
2021-06-01, 03h 72.502
2021-06-01, 04h 70.318
... ...
2021-11-19, 12h 71.079
2021-11-19, 13h 119.868
2021-11-19, 14h 77.796
2021-11-19, 15h 96.982
2021-11-19, 16h 63.335

3181 rows × 1 columns

In [12]:
#group steps data by the hour using sum steps
stepssum = steps_data.groupby('creationdate').sum()
stepssum
Out[12]:
value
creationdate
2021-06-01, 00h 8.0
2021-06-01, 05h 3.0
2021-06-01, 06h 774.0
2021-06-01, 07h 1021.0
2021-06-01, 08h 1799.0
... ...
2021-11-19, 14h 1645.0
2021-11-19, 15h 555.0
2021-11-19, 16h 1460.0
2021-11-19, 17h 68.0
2021-11-19, 19h 8.0

2444 rows × 1 columns

In [13]:
#merge data into single df with 1 observation per time period
calhrsteps_merged = hrmean.merge(calsum, how='inner', on='creationdate').merge(stepssum, how='inner', on='creationdate')
calhrsteps_merged
Out[13]:
value_x value_y value
creationdate
2021-06-01, 00h 54.619877 76.537 8.0
2021-06-01, 05h 53.461538 72.339 3.0
2021-06-01, 06h 69.000000 96.094 774.0
2021-06-01, 07h 75.500000 93.786 1021.0
2021-06-01, 08h 86.625000 98.991 1799.0
... ... ... ...
2021-11-19, 12h 71.692308 71.079 2133.0
2021-11-19, 13h 74.600000 119.868 2313.0
2021-11-19, 14h 77.500000 77.796 1645.0
2021-11-19, 15h 73.545455 96.982 555.0
2021-11-19, 16h 70.571429 63.335 1460.0

1906 rows × 3 columns

In [14]:
#rename merged columns
calhrsteps_merged = calhrsteps_merged.rename(columns = {'value_x': 'HR', 'value_y':'Cal', 'value':'Steps'})
calhrsteps_merged
Out[14]:
HR Cal Steps
creationdate
2021-06-01, 00h 54.619877 76.537 8.0
2021-06-01, 05h 53.461538 72.339 3.0
2021-06-01, 06h 69.000000 96.094 774.0
2021-06-01, 07h 75.500000 93.786 1021.0
2021-06-01, 08h 86.625000 98.991 1799.0
... ... ... ...
2021-11-19, 12h 71.692308 71.079 2133.0
2021-11-19, 13h 74.600000 119.868 2313.0
2021-11-19, 14h 77.500000 77.796 1645.0
2021-11-19, 15h 73.545455 96.982 555.0
2021-11-19, 16h 70.571429 63.335 1460.0

1906 rows × 3 columns

In [15]:
calhrsteps_merged.reset_index(inplace=True)
calhrsteps_merged.head(30)
Out[15]:
creationdate HR Cal Steps
0 2021-06-01, 00h 54.619877 76.537 8.0
1 2021-06-01, 05h 53.461538 72.339 3.0
2 2021-06-01, 06h 69.000000 96.094 774.0
3 2021-06-01, 07h 75.500000 93.786 1021.0
4 2021-06-01, 08h 86.625000 98.991 1799.0
5 2021-06-01, 09h 78.000000 99.325 290.0
6 2021-06-01, 10h 80.857143 104.291 972.0
7 2021-06-01, 11h 88.000000 72.936 2082.0
8 2021-06-01, 12h 84.272727 91.639 579.0
9 2021-06-01, 13h 87.673471 96.992 351.0
10 2021-06-01, 14h 82.444444 98.936 1652.0
11 2021-06-01, 15h 81.000000 99.292 1259.0
12 2021-06-01, 16h 73.454545 71.236 422.0
13 2021-06-01, 17h 73.100000 80.946 216.0
14 2021-06-01, 18h 81.361407 87.330 145.0
15 2021-06-01, 22h 62.466881 78.590 12.0
16 2021-06-02, 00h 53.871187 55.240 11.0
17 2021-06-02, 01h 53.642857 70.145 9.0
18 2021-06-02, 02h 51.868660 80.394 27.0
19 2021-06-02, 03h 52.785714 80.767 18.0
20 2021-06-02, 04h 52.720033 73.148 12.0
21 2021-06-02, 06h 73.250000 88.391 765.0
22 2021-06-02, 07h 78.384615 68.388 833.0
23 2021-06-02, 08h 87.812500 109.295 2823.0
24 2021-06-02, 09h 79.910567 82.883 396.0
25 2021-06-02, 10h 76.375000 104.784 416.0
26 2021-06-02, 11h 88.466667 97.964 2541.0
27 2021-06-02, 12h 85.166667 97.647 1052.0
28 2021-06-02, 13h 82.250000 95.467 407.0
29 2021-06-02, 14h 78.500000 95.521 1637.0
In [16]:
#creationdate back to datetime object for plotting
calhrsteps_merged1 = pd.DataFrame(calhrsteps_merged, columns=['creationdate','HR'])
calhrsteps_merged1['creationdate']=pd.to_datetime(calhrsteps_merged1['creationdate'])

#plot the time series
plt.figure(figsize=(40,10), dpi=80)
plt.xticks(rotation=90)
plt.plot('creationdate','HR',data=calhrsteps_merged1, marker='s', markerfacecolor = 'blue', linewidth=2, color='blue')
plt.legend()
plt.title('HR during 2021')
plt.xlabel('Date')
plt.ylabel('HR')
plt.show()
In [17]:
#regress HR on steps and calorie consumption
In [18]:
#I will use 10/1 as the cutoff for splitting into train and test data
split_date = '2021-10-01, 00h'
df_training = calhrsteps_merged[calhrsteps_merged['creationdate'] < split_date]
df_test = calhrsteps_merged.loc[calhrsteps_merged['creationdate'] >= split_date]
print(df_test)
         creationdate         HR      Cal   Steps
1350  2021-10-01, 01h  48.753457   75.107    10.0
1351  2021-10-01, 02h  47.714286   70.797     8.0
1352  2021-10-01, 04h  49.428571   74.169    16.0
1353  2021-10-01, 05h  47.200000   71.879     9.0
1354  2021-10-01, 06h  63.250000   88.045   926.0
...               ...        ...      ...     ...
1901  2021-11-19, 12h  71.692308   71.079  2133.0
1902  2021-11-19, 13h  74.600000  119.868  2313.0
1903  2021-11-19, 14h  77.500000   77.796  1645.0
1904  2021-11-19, 15h  73.545455   96.982   555.0
1905  2021-11-19, 16h  70.571429   63.335  1460.0

[556 rows x 4 columns]
In [19]:
print(df_training.shape)
print(df_test.shape)
(1350, 4)
(556, 4)
In [20]:
#use sklearn for regression and calculating expected HR
x_train = df_training[['Cal', 'Steps']]
y_train = df_training['HR']
In [21]:
#train model
model = LinearRegression().fit(x_train,y_train)
In [22]:
#determine intercept and coefficient values
print('Intercept: ', model.intercept_)
print('Coefficients: ', model.coef_)
Intercept:  68.69623189976278
Coefficients:  [0.00271917 0.00946637]
In [23]:
r_sq = model.score(x_train,y_train)
print(f"coefficient of determination: {r_sq}")
coefficient of determination: 0.2630705298599707
In [24]:
#examine regression model built on training data
x_train = sm.add_constant(x_train)
result = sm.OLS(y_train, x_train).fit()
print(result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     HR   R-squared:                       0.263
Model:                            OLS   Adj. R-squared:                  0.262
Method:                 Least Squares   F-statistic:                     240.4
Date:                Wed, 07 Dec 2022   Prob (F-statistic):           5.14e-90
Time:                        22:32:31   Log-Likelihood:                -5590.5
No. Observations:                1350   AIC:                         1.119e+04
Df Residuals:                    1347   BIC:                         1.120e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         68.6962      0.631    108.930      0.000      67.459      69.933
Cal            0.0027      0.003      0.971      0.332      -0.003       0.008
Steps          0.0095      0.000     21.861      0.000       0.009       0.010
==============================================================================
Omnibus:                      383.663   Durbin-Watson:                   1.044
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1028.247
Skew:                           1.480   Prob(JB):                    5.23e-224
Kurtosis:                       6.085   Cond. No.                     1.98e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.98e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [25]:
#run model on test data
x_test = df_test[['Cal', 'Steps']]
y_test = df_test['HR']
y_pred = model.predict(x_test)
In [26]:
#evaluate regression model run on test data
print(f"MAE:  {metrics.mean_absolute_error(y_test, y_pred)}")
print(f"MSE:  {metrics.mean_squared_error(y_test, y_pred)}")
print(f"RMSE: {np.sqrt(metrics.mean_squared_error(y_test, y_pred))}")
MAE:  12.488762178227297
MSE:  291.6190893621918
RMSE: 17.076858298943392
In [27]:
#regress heart rate on steps and calorie consumption to find expected HR 
x = calhrsteps_merged[['Cal', 'Steps']]
y = calhrsteps_merged['HR']
y_exp = model.predict(x)
calhrsteps_merged['ExpHR'] = y_exp
calhrsteps_merged
Out[27]:
creationdate HR Cal Steps ExpHR
0 2021-06-01, 00h 54.619877 76.537 8.0 68.980080
1 2021-06-01, 05h 53.461538 72.339 3.0 68.921333
2 2021-06-01, 06h 69.000000 96.094 774.0 76.284502
3 2021-06-01, 07h 75.500000 93.786 1021.0 78.616421
4 2021-06-01, 08h 86.625000 98.991 1799.0 85.995413
... ... ... ... ... ...
1901 2021-11-19, 12h 71.692308 71.079 2133.0 89.081285
1902 2021-11-19, 13h 74.600000 119.868 2313.0 90.917898
1903 2021-11-19, 14h 77.500000 77.796 1645.0 84.479959
1904 2021-11-19, 15h 73.545455 96.982 555.0 74.213781
1905 2021-11-19, 16h 70.571429 63.335 1460.0 82.689357

1906 rows × 5 columns

In [28]:
#create histogram
#check for normal distribution
calhrsteps_merged[["HR"]].plot.hist()
Out[28]:
<AxesSubplot:ylabel='Frequency'>
In [29]:
#report the r square of the regression
r_sq = model.score(x,y)
print(f"coefficient of determination: {r_sq}")
coefficient of determination: 0.23570219278462223
In [30]:
### TUKEY 
In [31]:
#sort by expected HR
calhrsteps_merged.sort_values(by=['ExpHR']).reset_index(drop=True)
Out[31]:
creationdate HR Cal Steps ExpHR
0 2021-09-14, 05h 48.181818 59.311 2.0 68.876442
1 2021-06-26, 01h 58.933333 58.528 3.0 68.883779
2 2021-06-29, 03h 56.000000 62.051 3.0 68.893358
3 2021-10-25, 23h 50.603885 73.518 1.0 68.905606
4 2021-09-14, 02h 47.969573 64.152 4.0 68.908538
... ... ... ... ... ...
1901 2021-06-18, 16h 109.954545 84.125 6196.0 127.578639
1902 2021-06-18, 18h 113.461538 77.626 6543.0 130.845799
1903 2021-10-03, 14h 86.421053 94.084 6850.0 133.796728
1904 2021-06-19, 17h 111.260870 106.856 6940.0 134.683431
1905 2021-10-24, 13h 88.946535 108.977 9130.0 155.420558

1906 rows × 5 columns

In [32]:
calhrsteps_merged.describe()
Out[32]:
HR Cal Steps ExpHR
count 1906.000000 1906.000000 1906.000000 1906.000000
mean 77.011142 109.267717 933.345750 77.828750
std 18.054119 151.505854 970.212795 9.201680
min 46.000000 30.410000 1.000000 68.876442
25% 66.918269 76.663000 154.500000 70.445414
50% 74.000000 90.321000 658.000000 75.319080
75% 82.433333 98.106000 1451.250000 82.726656
max 154.861511 1888.441000 9130.000000 155.420558
In [33]:
#fourth spreads
calhrsteps_merged['lower 4th']= 70.445414
calhrsteps_merged['upper 4th']= 82.726656
calhrsteps_merged['Fourth_spreads']=calhrsteps_merged['upper 4th']-calhrsteps_merged['lower 4th']

calhrsteps_merged
Out[33]:
creationdate HR Cal Steps ExpHR lower 4th upper 4th Fourth_spreads
0 2021-06-01, 00h 54.619877 76.537 8.0 68.980080 70.445414 82.726656 12.281242
1 2021-06-01, 05h 53.461538 72.339 3.0 68.921333 70.445414 82.726656 12.281242
2 2021-06-01, 06h 69.000000 96.094 774.0 76.284502 70.445414 82.726656 12.281242
3 2021-06-01, 07h 75.500000 93.786 1021.0 78.616421 70.445414 82.726656 12.281242
4 2021-06-01, 08h 86.625000 98.991 1799.0 85.995413 70.445414 82.726656 12.281242
... ... ... ... ... ... ... ... ...
1901 2021-11-19, 12h 71.692308 71.079 2133.0 89.081285 70.445414 82.726656 12.281242
1902 2021-11-19, 13h 74.600000 119.868 2313.0 90.917898 70.445414 82.726656 12.281242
1903 2021-11-19, 14h 77.500000 77.796 1645.0 84.479959 70.445414 82.726656 12.281242
1904 2021-11-19, 15h 73.545455 96.982 555.0 74.213781 70.445414 82.726656 12.281242
1905 2021-11-19, 16h 70.571429 63.335 1460.0 82.689357 70.445414 82.726656 12.281242

1906 rows × 8 columns

In [34]:
#calculate ucl and lcl
calhrsteps_merged['UCL']=calhrsteps_merged['upper 4th']+1.5*calhrsteps_merged['Fourth_spreads']
calhrsteps_merged['LCL']=calhrsteps_merged['upper 4th']-1.5*calhrsteps_merged['Fourth_spreads']
calhrsteps_merged
Out[34]:
creationdate HR Cal Steps ExpHR lower 4th upper 4th Fourth_spreads UCL LCL
0 2021-06-01, 00h 54.619877 76.537 8.0 68.980080 70.445414 82.726656 12.281242 101.148519 64.304793
1 2021-06-01, 05h 53.461538 72.339 3.0 68.921333 70.445414 82.726656 12.281242 101.148519 64.304793
2 2021-06-01, 06h 69.000000 96.094 774.0 76.284502 70.445414 82.726656 12.281242 101.148519 64.304793
3 2021-06-01, 07h 75.500000 93.786 1021.0 78.616421 70.445414 82.726656 12.281242 101.148519 64.304793
4 2021-06-01, 08h 86.625000 98.991 1799.0 85.995413 70.445414 82.726656 12.281242 101.148519 64.304793
... ... ... ... ... ... ... ... ... ... ...
1901 2021-11-19, 12h 71.692308 71.079 2133.0 89.081285 70.445414 82.726656 12.281242 101.148519 64.304793
1902 2021-11-19, 13h 74.600000 119.868 2313.0 90.917898 70.445414 82.726656 12.281242 101.148519 64.304793
1903 2021-11-19, 14h 77.500000 77.796 1645.0 84.479959 70.445414 82.726656 12.281242 101.148519 64.304793
1904 2021-11-19, 15h 73.545455 96.982 555.0 74.213781 70.445414 82.726656 12.281242 101.148519 64.304793
1905 2021-11-19, 16h 70.571429 63.335 1460.0 82.689357 70.445414 82.726656 12.281242 101.148519 64.304793

1906 rows × 10 columns

In [35]:
calhrsteps_merged2 = pd.DataFrame(calhrsteps_merged, columns=['creationdate','HR','UCL','LCL'])
calhrsteps_merged2['creationdate']=pd.to_datetime(calhrsteps_merged2['creationdate'])

plt.figure(figsize=(20,10), dpi=80)
plt.xticks(rotation=90)
plt.plot('creationdate','HR',data=calhrsteps_merged2, marker='s', color='blue',linewidth=2)
plt.plot('creationdate','UCL', data=calhrsteps_merged2, color='red',linewidth=2)
plt.plot('creationdate','LCL', data=calhrsteps_merged2, color='red',linewidth=2)
plt.legend()
plt.title('Tukey - All Data')
plt.xlabel('Date')
plt.ylabel('Heart Rate (beats per min)')
plt.show()
In [36]:
#plot 2 days worth of data only
calhrsteps_merged3 = pd.DataFrame(calhrsteps_merged, columns=['creationdate','HR','UCL','LCL'])
calhrsteps_merged3['creationdate']=pd.to_datetime(calhrsteps_merged3['creationdate'])

split_date = datetime.datetime(2021, 11, 18)
calhrsteps_merged3 = calhrsteps_merged3.loc[calhrsteps_merged3['creationdate'] > split_date]
calhrsteps_merged3
Out[36]:
creationdate HR UCL LCL
1882 2021-11-18 07:00:00 71.333333 101.148519 64.304793
1883 2021-11-18 08:00:00 69.357143 101.148519 64.304793
1884 2021-11-18 09:00:00 70.211635 101.148519 64.304793
1885 2021-11-18 10:00:00 60.750000 101.148519 64.304793
1886 2021-11-18 11:00:00 62.916667 101.148519 64.304793
1887 2021-11-18 12:00:00 68.554154 101.148519 64.304793
1888 2021-11-18 13:00:00 73.000000 101.148519 64.304793
1889 2021-11-18 14:00:00 96.384615 101.148519 64.304793
1890 2021-11-18 15:00:00 89.142857 101.148519 64.304793
1891 2021-11-18 16:00:00 71.083333 101.148519 64.304793
1892 2021-11-18 17:00:00 105.424528 101.148519 64.304793
1893 2021-11-18 18:00:00 109.230534 101.148519 64.304793
1894 2021-11-18 19:00:00 74.375000 101.148519 64.304793
1895 2021-11-18 22:00:00 153.453800 101.148519 64.304793
1896 2021-11-19 07:00:00 66.000000 101.148519 64.304793
1897 2021-11-19 08:00:00 81.312500 101.148519 64.304793
1898 2021-11-19 09:00:00 76.721756 101.148519 64.304793
1899 2021-11-19 10:00:00 67.909091 101.148519 64.304793
1900 2021-11-19 11:00:00 69.923077 101.148519 64.304793
1901 2021-11-19 12:00:00 71.692308 101.148519 64.304793
1902 2021-11-19 13:00:00 74.600000 101.148519 64.304793
1903 2021-11-19 14:00:00 77.500000 101.148519 64.304793
1904 2021-11-19 15:00:00 73.545455 101.148519 64.304793
1905 2021-11-19 16:00:00 70.571429 101.148519 64.304793
In [37]:
plt.figure(figsize=(20,10), dpi=80)
plt.xticks(rotation=90)
plt.plot('creationdate','HR',data=calhrsteps_merged3, marker='s', color='blue',linewidth=2)
plt.plot('creationdate','UCL', data=calhrsteps_merged3, color='red',linewidth=2)
plt.plot('creationdate','LCL', data=calhrsteps_merged3, color='red',linewidth=2)
plt.legend()
plt.title('Tukey - HR from 11/18/21-11/19/21')
plt.xlabel('Date')
plt.ylabel('Heart Rate (beats per min)')
plt.show()
In [ ]: