import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import statsmodels.api as sm

### ANALYZE USING TUKEY

hr_data = pd.read_csv(r"C:\Users\pktra\Desktop\sample_HR_data.csv")
cal_data = pd.read_csv(r"C:\Users\pktra\Desktop\sample_calories_data.csv")
steps_data = pd.read_csv(r"C:\Users\pktra\Desktop\sample_steps_data.csv")

C:\Users\pktra\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3457: DtypeWarning: Columns (6) have mixed types.Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)

cal_data.head()

steps_data.head()

#filter to only columns we need 
hr_data = pd.DataFrame(hr_data, columns=['creationdate','value'])
cal_data = pd.DataFrame(cal_data, columns=['creationdate','value'])
steps_data = pd.DataFrame(steps_data, columns=['creationdate','value'])

#convert types
hr_data['value'].astype('float')
cal_data['value'].astype('float')
steps_data['value'].astype('float')

0          85.0
1          10.0
2         154.0
3         371.0
4          95.0
          ...  
188331    235.0
188332    204.0
188333     10.0
188334     58.0
188335      8.0
Name: value, Length: 188336, dtype: float64

hr_data['creationdate']=pd.to_datetime(hr_data['creationdate'])
cal_data['creationdate']=pd.to_datetime(cal_data['creationdate'])
steps_data['creationdate']=pd.to_datetime(steps_data['creationdate'])

#reduce timeframe of analysis to 6/2021-12/2021 data only
split_date = datetime.datetime(2021, 6, 1)
hr_data = hr_data.loc[hr_data['creationdate'] > split_date]
cal_data = cal_data.loc[cal_data['creationdate'] > split_date]
steps_data = steps_data.loc[steps_data['creationdate'] > split_date]

hr_data['creationdate'] = hr_data['creationdate'].dt.strftime("%Y-%m-%d, %Hh")
cal_data['creationdate'] = cal_data['creationdate'].dt.strftime("%Y-%m-%d, %Hh")
steps_data['creationdate'] = steps_data['creationdate'].dt.strftime("%Y-%m-%d, %Hh")

hr_data.head()

#remove nearly impossible HR values
hr_data=hr_data[hr_data.value < 220] #cannot exceed max heart rate
hr_data=hr_data[hr_data.value > 38] #cannot be bradycardic

#group hr data by the hour 
hrmean = hr_data.groupby('creationdate').mean()
hrmean

#group calorie data by the hour using sum energy expenditure
calsum = cal_data.groupby('creationdate').sum()
calsum

#remove nearly impossible calorie expenditure (average person burns 45 calories per hour, at rest)
calsum=calsum[calsum.value > 30]
calsum

#group steps data by the hour using sum steps
stepssum = steps_data.groupby('creationdate').sum()
stepssum

#merge data into single df with 1 observation per time period
calhrsteps_merged = hrmean.merge(calsum, how='inner', on='creationdate').merge(stepssum, how='inner', on='creationdate')
calhrsteps_merged

#rename merged columns
calhrsteps_merged = calhrsteps_merged.rename(columns = {'value_x': 'HR', 'value_y':'Cal', 'value':'Steps'})
calhrsteps_merged

calhrsteps_merged.reset_index(inplace=True)
calhrsteps_merged.head(30)

#creationdate back to datetime object for plotting
calhrsteps_merged1 = pd.DataFrame(calhrsteps_merged, columns=['creationdate','HR'])
calhrsteps_merged1['creationdate']=pd.to_datetime(calhrsteps_merged1['creationdate'])

#plot the time series
plt.figure(figsize=(40,10), dpi=80)
plt.xticks(rotation=90)
plt.plot('creationdate','HR',data=calhrsteps_merged1, marker='s', markerfacecolor = 'blue', linewidth=2, color='blue')
plt.legend()
plt.title('HR during 2021')
plt.xlabel('Date')
plt.ylabel('HR')
plt.show()

#regress HR on steps and calorie consumption

#I will use 10/1 as the cutoff for splitting into train and test data
split_date = '2021-10-01, 00h'
df_training = calhrsteps_merged[calhrsteps_merged['creationdate'] < split_date]
df_test = calhrsteps_merged.loc[calhrsteps_merged['creationdate'] >= split_date]
print(df_test)

         creationdate         HR      Cal   Steps
1350  2021-10-01, 01h  48.753457   75.107    10.0
1351  2021-10-01, 02h  47.714286   70.797     8.0
1352  2021-10-01, 04h  49.428571   74.169    16.0
1353  2021-10-01, 05h  47.200000   71.879     9.0
1354  2021-10-01, 06h  63.250000   88.045   926.0
...               ...        ...      ...     ...
1901  2021-11-19, 12h  71.692308   71.079  2133.0
1902  2021-11-19, 13h  74.600000  119.868  2313.0
1903  2021-11-19, 14h  77.500000   77.796  1645.0
1904  2021-11-19, 15h  73.545455   96.982   555.0
1905  2021-11-19, 16h  70.571429   63.335  1460.0

[556 rows x 4 columns]

print(df_training.shape)
print(df_test.shape)

(1350, 4)
(556, 4)

#use sklearn for regression and calculating expected HR
x_train = df_training[['Cal', 'Steps']]
y_train = df_training['HR']

#train model
model = LinearRegression().fit(x_train,y_train)

#determine intercept and coefficient values
print('Intercept: ', model.intercept_)
print('Coefficients: ', model.coef_)

Intercept:  68.69623189976278
Coefficients:  [0.00271917 0.00946637]

r_sq = model.score(x_train,y_train)
print(f"coefficient of determination: {r_sq}")

coefficient of determination: 0.2630705298599707

#examine regression model built on training data
x_train = sm.add_constant(x_train)
result = sm.OLS(y_train, x_train).fit()
print(result.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     HR   R-squared:                       0.263
Model:                            OLS   Adj. R-squared:                  0.262
Method:                 Least Squares   F-statistic:                     240.4
Date:                Wed, 07 Dec 2022   Prob (F-statistic):           5.14e-90
Time:                        22:32:31   Log-Likelihood:                -5590.5
No. Observations:                1350   AIC:                         1.119e+04
Df Residuals:                    1347   BIC:                         1.120e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         68.6962      0.631    108.930      0.000      67.459      69.933
Cal            0.0027      0.003      0.971      0.332      -0.003       0.008
Steps          0.0095      0.000     21.861      0.000       0.009       0.010
==============================================================================
Omnibus:                      383.663   Durbin-Watson:                   1.044
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1028.247
Skew:                           1.480   Prob(JB):                    5.23e-224
Kurtosis:                       6.085   Cond. No.                     1.98e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.98e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

#run model on test data
x_test = df_test[['Cal', 'Steps']]
y_test = df_test['HR']
y_pred = model.predict(x_test)

#evaluate regression model run on test data
print(f"MAE:  {metrics.mean_absolute_error(y_test, y_pred)}")
print(f"MSE:  {metrics.mean_squared_error(y_test, y_pred)}")
print(f"RMSE: {np.sqrt(metrics.mean_squared_error(y_test, y_pred))}")

MAE:  12.488762178227297
MSE:  291.6190893621918
RMSE: 17.076858298943392

#regress heart rate on steps and calorie consumption to find expected HR 
x = calhrsteps_merged[['Cal', 'Steps']]
y = calhrsteps_merged['HR']
y_exp = model.predict(x)
calhrsteps_merged['ExpHR'] = y_exp
calhrsteps_merged

#create histogram
#check for normal distribution
calhrsteps_merged[["HR"]].plot.hist()

<AxesSubplot:ylabel='Frequency'>

#report the r square of the regression
r_sq = model.score(x,y)
print(f"coefficient of determination: {r_sq}")

coefficient of determination: 0.23570219278462223

### TUKEY

#sort by expected HR
calhrsteps_merged.sort_values(by=['ExpHR']).reset_index(drop=True)

calhrsteps_merged.describe()

#fourth spreads
calhrsteps_merged['lower 4th']= 70.445414
calhrsteps_merged['upper 4th']= 82.726656
calhrsteps_merged['Fourth_spreads']=calhrsteps_merged['upper 4th']-calhrsteps_merged['lower 4th']

calhrsteps_merged

#calculate ucl and lcl
calhrsteps_merged['UCL']=calhrsteps_merged['upper 4th']+1.5*calhrsteps_merged['Fourth_spreads']
calhrsteps_merged['LCL']=calhrsteps_merged['upper 4th']-1.5*calhrsteps_merged['Fourth_spreads']
calhrsteps_merged

calhrsteps_merged2 = pd.DataFrame(calhrsteps_merged, columns=['creationdate','HR','UCL','LCL'])
calhrsteps_merged2['creationdate']=pd.to_datetime(calhrsteps_merged2['creationdate'])

plt.figure(figsize=(20,10), dpi=80)
plt.xticks(rotation=90)
plt.plot('creationdate','HR',data=calhrsteps_merged2, marker='s', color='blue',linewidth=2)
plt.plot('creationdate','UCL', data=calhrsteps_merged2, color='red',linewidth=2)
plt.plot('creationdate','LCL', data=calhrsteps_merged2, color='red',linewidth=2)
plt.legend()
plt.title('Tukey - All Data')
plt.xlabel('Date')
plt.ylabel('Heart Rate (beats per min)')
plt.show()

#plot 2 days worth of data only
calhrsteps_merged3 = pd.DataFrame(calhrsteps_merged, columns=['creationdate','HR','UCL','LCL'])
calhrsteps_merged3['creationdate']=pd.to_datetime(calhrsteps_merged3['creationdate'])

split_date = datetime.datetime(2021, 11, 18)
calhrsteps_merged3 = calhrsteps_merged3.loc[calhrsteps_merged3['creationdate'] > split_date]
calhrsteps_merged3

plt.figure(figsize=(20,10), dpi=80)
plt.xticks(rotation=90)
plt.plot('creationdate','HR',data=calhrsteps_merged3, marker='s', color='blue',linewidth=2)
plt.plot('creationdate','UCL', data=calhrsteps_merged3, color='red',linewidth=2)
plt.plot('creationdate','LCL', data=calhrsteps_merged3, color='red',linewidth=2)
plt.legend()
plt.title('Tukey - HR from 11/18/21-11/19/21')
plt.xlabel('Date')
plt.ylabel('Heart Rate (beats per min)')
plt.show()

	unit	creationdate	startdate	enddate	value	Unnamed: 5	Unnamed: 6
0	Cal	9/9/2018 10:54	9/9/2018 10:39	9/9/2018 10:54	18.258	NaN	NaN
1	Cal	9/9/2018 11:09	9/9/2018 10:54	9/9/2018 11:08	17.214	NaN	NaN
2	Cal	9/9/2018 11:24	9/9/2018 11:08	9/9/2018 11:24	18.665	NaN	NaN
3	Cal	9/9/2018 11:39	9/9/2018 11:24	9/9/2018 11:39	18.673	NaN	NaN
4	Cal	9/9/2018 11:54	9/9/2018 11:39	9/9/2018 11:53	17.429	NaN	NaN

	unit	creationdate	startdate	enddate	value
0	count	9/9/2018 12:24	9/9/2018 12:13	9/9/2018 12:23	85.0
1	count	9/9/2018 12:30	9/9/2018 12:28	9/9/2018 12:28	10.0
2	count	9/9/2018 13:02	9/9/2018 12:46	9/9/2018 12:56	154.0
3	count	9/9/2018 13:09	9/9/2018 12:56	9/9/2018 13:04	371.0
4	count	9/9/2018 13:22	9/9/2018 13:04	9/9/2018 13:14	95.0

	creationdate	value
161716	2021-06-01, 00h	52.0
161717	2021-06-01, 00h	54.0
161718	2021-06-01, 00h	55.0
161719	2021-06-01, 00h	54.0
161720	2021-06-01, 00h	55.0

	creationdate	HR	Cal	Steps
0	2021-06-01, 00h	54.619877	76.537	8.0
1	2021-06-01, 05h	53.461538	72.339	3.0
2	2021-06-01, 06h	69.000000	96.094	774.0
3	2021-06-01, 07h	75.500000	93.786	1021.0
4	2021-06-01, 08h	86.625000	98.991	1799.0
5	2021-06-01, 09h	78.000000	99.325	290.0
6	2021-06-01, 10h	80.857143	104.291	972.0
7	2021-06-01, 11h	88.000000	72.936	2082.0
8	2021-06-01, 12h	84.272727	91.639	579.0
9	2021-06-01, 13h	87.673471	96.992	351.0
10	2021-06-01, 14h	82.444444	98.936	1652.0
11	2021-06-01, 15h	81.000000	99.292	1259.0
12	2021-06-01, 16h	73.454545	71.236	422.0
13	2021-06-01, 17h	73.100000	80.946	216.0
14	2021-06-01, 18h	81.361407	87.330	145.0
15	2021-06-01, 22h	62.466881	78.590	12.0
16	2021-06-02, 00h	53.871187	55.240	11.0
17	2021-06-02, 01h	53.642857	70.145	9.0
18	2021-06-02, 02h	51.868660	80.394	27.0
19	2021-06-02, 03h	52.785714	80.767	18.0
20	2021-06-02, 04h	52.720033	73.148	12.0
21	2021-06-02, 06h	73.250000	88.391	765.0
22	2021-06-02, 07h	78.384615	68.388	833.0
23	2021-06-02, 08h	87.812500	109.295	2823.0
24	2021-06-02, 09h	79.910567	82.883	396.0
25	2021-06-02, 10h	76.375000	104.784	416.0
26	2021-06-02, 11h	88.466667	97.964	2541.0
27	2021-06-02, 12h	85.166667	97.647	1052.0
28	2021-06-02, 13h	82.250000	95.467	407.0
29	2021-06-02, 14h	78.500000	95.521	1637.0

	creationdate	HR	Cal	Steps	ExpHR
0	2021-09-14, 05h	48.181818	59.311	2.0	68.876442
1	2021-06-26, 01h	58.933333	58.528	3.0	68.883779
2	2021-06-29, 03h	56.000000	62.051	3.0	68.893358
3	2021-10-25, 23h	50.603885	73.518	1.0	68.905606
4	2021-09-14, 02h	47.969573	64.152	4.0	68.908538
...	...	...	...	...	...
1901	2021-06-18, 16h	109.954545	84.125	6196.0	127.578639
1902	2021-06-18, 18h	113.461538	77.626	6543.0	130.845799
1903	2021-10-03, 14h	86.421053	94.084	6850.0	133.796728
1904	2021-06-19, 17h	111.260870	106.856	6940.0	134.683431
1905	2021-10-24, 13h	88.946535	108.977	9130.0	155.420558

	HR	Cal	Steps	ExpHR
count	1906.000000	1906.000000	1906.000000	1906.000000
mean	77.011142	109.267717	933.345750	77.828750
std	18.054119	151.505854	970.212795	9.201680
min	46.000000	30.410000	1.000000	68.876442
25%	66.918269	76.663000	154.500000	70.445414
50%	74.000000	90.321000	658.000000	75.319080
75%	82.433333	98.106000	1451.250000	82.726656
max	154.861511	1888.441000	9130.000000	155.420558

	creationdate	HR	UCL	LCL
1882	2021-11-18 07:00:00	71.333333	101.148519	64.304793
1883	2021-11-18 08:00:00	69.357143	101.148519	64.304793
1884	2021-11-18 09:00:00	70.211635	101.148519	64.304793
1885	2021-11-18 10:00:00	60.750000	101.148519	64.304793
1886	2021-11-18 11:00:00	62.916667	101.148519	64.304793
1887	2021-11-18 12:00:00	68.554154	101.148519	64.304793
1888	2021-11-18 13:00:00	73.000000	101.148519	64.304793
1889	2021-11-18 14:00:00	96.384615	101.148519	64.304793
1890	2021-11-18 15:00:00	89.142857	101.148519	64.304793
1891	2021-11-18 16:00:00	71.083333	101.148519	64.304793
1892	2021-11-18 17:00:00	105.424528	101.148519	64.304793
1893	2021-11-18 18:00:00	109.230534	101.148519	64.304793
1894	2021-11-18 19:00:00	74.375000	101.148519	64.304793
1895	2021-11-18 22:00:00	153.453800	101.148519	64.304793
1896	2021-11-19 07:00:00	66.000000	101.148519	64.304793
1897	2021-11-19 08:00:00	81.312500	101.148519	64.304793
1898	2021-11-19 09:00:00	76.721756	101.148519	64.304793
1899	2021-11-19 10:00:00	67.909091	101.148519	64.304793
1900	2021-11-19 11:00:00	69.923077	101.148519	64.304793
1901	2021-11-19 12:00:00	71.692308	101.148519	64.304793
1902	2021-11-19 13:00:00	74.600000	101.148519	64.304793
1903	2021-11-19 14:00:00	77.500000	101.148519	64.304793
1904	2021-11-19 15:00:00	73.545455	101.148519	64.304793
1905	2021-11-19 16:00:00	70.571429	101.148519	64.304793