#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#Import data files
d_201611=pd.read_csv (r'C:\Users\user1\Documents\muriyo\gmu\hap725\week5\q1\data\TimelyandEffectiveCare_Hospital_20161110.csv', engine='python')
d_201608=pd.read_csv (r'C:\Users\user1\Documents\muriyo\gmu\hap725\week5\q1\data\TimelyandEffectiveCare_Hospital_20160810.csv', engine='python')
d_201605=pd.read_csv (r'C:\Users\user1\Documents\muriyo\gmu\hap725\week5\q1\data\TimelyandEffectiveCare_Hospital_20160504.csv', engine='python')
d_201512=pd.read_csv (r'C:\Users\user1\Documents\muriyo\gmu\hap725\week5\q1\data\TimelyandEffectiveCare_Hospital_20151210.csv', engine='python')
d_201510=pd.read_csv (r'C:\Users\user1\Documents\muriyo\gmu\hap725\week5\q1\data\TimelyandEffectiveCare_Hospital_20151008.csv', engine='python')
d_201507=pd.read_csv (r'C:\Users\user1\Documents\muriyo\gmu\hap725\week5\q1\data\TimelyandEffectiveCare_Hospital_20150716.csv', engine='python')
d_201505=pd.read_csv (r'C:\Users\user1\Documents\muriyo\gmu\hap725\week5\q1\data\TimelyandEffectiveCare_Hospital_20150506.csv', engine='python')
d_201504=pd.read_csv (r'C:\Users\user1\Documents\muriyo\gmu\hap725\week5\q1\data\TimelyandEffectiveCare_Hospital_20150416.csv', engine='python')
d_201501=pd.read_csv (r'C:\Users\user1\Documents\muriyo\gmu\hap725\week5\q1\data\TimelyandEffectiveCare_Hospital_20150122.csv', engine='python')
#View data in dataframe for 201611
d_201611.head(2)
#Note that Provider ID for 201611 dataset is Integer while in other datasets it is a string value
#Sccore and Sample are also Object value and need to be converted to a float to allow for calculations
print("\033[1m" +'201611 data types' + "\033[0m")
print (d_201611.dtypes)
print("\033[1m" +'201608 data types' + "\033[0m")
print (d_201608.dtypes)
print("\033[1m" +'201605 data types' + "\033[0m")
print (d_201605.dtypes)
print("\033[1m" +'201512 data types' + "\033[0m")
print (d_201512.dtypes)
print("\033[1m" +'201510 data types' + "\033[0m")
print (d_201510.dtypes)
print("\033[1m" +'201507 data types' + "\033[0m")
print (d_201507.dtypes)
print("\033[1m" +'201505 data types' + "\033[0m")
print (d_201505.dtypes)
print("\033[1m" +'201504 data types' + "\033[0m")
print (d_201504.dtypes)
print("\033[1m" +'201501 data types' + "\033[0m")
print (d_201501.dtypes)
#Filter for SOUTHEAST ALABAMA MEDICAL CENTER for each year.
#Filter for City = Dothan
#Select measure ID "SCIP_INF_3"; this measure refers to prophylactic antibiotic use.
h = ['SOUTHEAST ALABAMA MEDICAL CENTER' ]
c = ['DOTHAN']
m = ['SCIP_INF_3']
d_201611s=pd.DataFrame(d_201611, columns =['Provider ID', 'Hospital Name', 'City', 'State', 'Measure ID',
'Score', 'Sample', 'Measure Start Date', 'Measure End Date'] )
d_201611s = d_201611s[(d_201611s['Hospital Name'].isin(h)) & (d_201611s['City'].isin (c)) & (d_201611s['Measure ID'].isin (m))]
print("\033[1m" +'201611 samc Filtered Dataset' + "\033[0m")
print(d_201611s.head())
d_201608s=pd.DataFrame(d_201608, columns =['Provider ID', 'Hospital Name', 'City', 'State', 'Measure ID',
'Score', 'Sample', 'Measure Start Date', 'Measure End Date'] )
d_201608s = d_201608s[(d_201608s['Hospital Name'].isin(h)) & (d_201608s['City'].isin (c)) & (d_201608s['Measure ID'].isin (m))]
print("\033[1m" +'201608 samc Filtered Dataset' + "\033[0m")
print(d_201608s.head())
d_201605s=pd.DataFrame(d_201605, columns =['Provider ID', 'Hospital Name', 'City', 'State', 'Measure ID',
'Score', 'Sample', 'Measure Start Date', 'Measure End Date'] )
d_201605s = d_201605s[(d_201605s['Hospital Name'].isin(h)) & (d_201605s['City'].isin (c)) & (d_201605s['Measure ID'].isin (m))]
print("\033[1m" +'201605 samc Filtered Dataset' + "\033[0m")
print(d_201605s.head())
d_201512s=pd.DataFrame(d_201512, columns =['Provider ID', 'Hospital Name', 'City', 'State', 'Measure ID',
'Score', 'Sample', 'Measure Start Date', 'Measure End Date'] )
d_201512s = d_201512s[(d_201512s['Hospital Name'].isin(h)) & (d_201512s['City'].isin (c)) & (d_201512s['Measure ID'].isin (m))]
print("\033[1m" +'201512 samc Filtered Dataset' + "\033[0m")
print(d_201512s.head())
d_201510s=pd.DataFrame(d_201510, columns =['Provider ID', 'Hospital Name', 'City', 'State', 'Measure ID',
'Score', 'Sample', 'Measure Start Date', 'Measure End Date'] )
d_201510s = d_201510s[(d_201510s['Hospital Name'].isin(h)) & (d_201510s['City'].isin (c)) & (d_201510s['Measure ID'].isin (m))]
print("\033[1m" +'201510 samc Filtered Dataset' + "\033[0m")
print(d_201510s.head())
d_201507s=pd.DataFrame(d_201507, columns =['Provider ID', 'Hospital Name', 'City', 'State', 'Measure ID',
'Score', 'Sample', 'Measure Start Date', 'Measure End Date'] )
d_201507s = d_201507s[(d_201507s['Hospital Name'].isin(h)) & (d_201507s['City'].isin (c)) & (d_201507s['Measure ID'].isin (m))]
print("\033[1m" +'201507 samc Filtered Dataset' + "\033[0m")
print(d_201507s.head())
d_201505s=pd.DataFrame(d_201505, columns =['Provider ID', 'Hospital Name', 'City', 'State', 'Measure ID',
'Score', 'Sample', 'Measure Start Date', 'Measure End Date'] )
d_201505s = d_201505s[(d_201505s['Hospital Name'].isin(h)) & (d_201505s['City'].isin (c)) & (d_201505s['Measure ID'].isin (m))]
print("\033[1m" +'201505 samc Filtered Dataset' + "\033[0m")
print(d_201505s.head())
d_201504s=pd.DataFrame(d_201504, columns =['Provider ID', 'Hospital Name', 'City', 'State', 'Measure ID',
'Score', 'Sample', 'Measure Start Date', 'Measure End Date'] )
d_201504s = d_201505s[(d_201504s['Hospital Name'].isin(h)) & (d_201504s['City'].isin (c)) & (d_201504s['Measure ID'].isin (m))]
print("\033[1m" +'201504 samc Filtered Dataset' + "\033[0m")
print(d_201504s.head())
d_201501s=pd.DataFrame(d_201501, columns =['Provider ID', 'Hospital Name', 'City', 'State', 'Measure ID',
'Score', 'Sample', 'Measure Start Date', 'Measure End Date'] )
d_201501s = d_201501s[(d_201501s['Hospital Name'].isin(h)) & (d_201501s['City'].isin (c)) & (d_201501s['Measure ID'].isin (m))]
print("\033[1m" +'201501 samc Filtered Dataset' + "\033[0m")
print(d_201501s.head())
#Find number of rows and columns for the 9 data frames
print(d_201611s.shape)
print(d_201608s.shape)
print(d_201605s.shape)
print(d_201512s.shape)
print(d_201510s.shape)
print(d_201507s.shape)
print(d_201505s.shape)
print(d_201504s.shape)
print(d_201501s.shape)
#Check resulting dataframe for 201611s
d_201611s.head()
#Check for non numeric values in the HCAHPS Linear Mean Value column for the above data frames.
#The traceback feedback includes a ValueError and shows that it could not convert the 'Not Applicable' string to a float.
d_201611s['Score'].astype('float')
#Append the 9 data frames into one dataset
dfs = [d_201611s, d_201608s, d_201605s, d_201512s, d_201510s,d_201507s, d_201505s, d_201504s,d_201501s ]
samc = pd.concat(dfs)
samc
samc.dtypes
#Convert data types
samc['Score'] = pd.to_numeric(samc['Score'])
samc['Sample'] = pd.to_numeric(samc['Sample'])
samc['Measure Start Date'] = pd.to_datetime(samc['Measure Start Date'])
samc['Measure End Date'] = pd.to_datetime(samc['Measure End Date'])
samc.dtypes
#Calcuate Midway Point: ( [Measure Start Date] + [Measure End Data] ) / 2
samc['MidP'] = samc['Measure Start Date'] + (samc['Measure End Date'] - samc['Measure Start Date'])/2
#Calcuate Rate of Overuse: [Score] / [Sample]
samc['RateOveruse'] = samc['Score']/samc['Sample']
#Grand Rate: SUM (Score) / SUM (Sample)
samc['Grandrate'] = sum(samc['Score'])/sum(samc['Sample'])
#Upper Limit: Grand Rate + 1.96*SQRT((Grand Rate*(1-Grand Rate))/Sample)
#Lower Limit: Grand Rate - 1.96*SQRT((Grand Rate*(1-Grand Rate))/Sample)
samc['UCL'] = samc['Grandrate'] + (1.96 * np.sqrt((samc['Grandrate'] * (1 - samc['Grandrate']))/samc['Sample']))
samc['LCL'] = samc['Grandrate'] - (1.96 * np.sqrt((samc['Grandrate'] * (1 - samc['Grandrate']))/samc['Sample']))
samc
#Create the p-chart
plt.figure(figsize=(15,5))
plt.plot('Measure Start Date', 'RateOveruse', data=samc, marker = 's', markerfacecolor = 'blue', linewidth =1, color = 'blue')
plt.plot('Measure Start Date', 'UCL', data=samc, markersize = 0, color = 'red', linewidth =1)
plt.plot('Measure Start Date', 'LCL', data=samc, markersize = 0, color = 'red', linewidth =1)
plt.rotation=60
plt.legend()
plt.title('Rate of Post-Op Antibiotic Overuse: Apr 2013 - Jan 2015')
plt.xlabel('Measure Start Date')
plt.ylabel('Rate of Overuse')