Chapter 1 (Introduction)
"Statistics starts with a problem, proceeds with the collection of data, continues with the data analysis and finishes with conclusions" - Faraway, 2014 -
"The formulation of a problem is often more essential than its solution, which may be merely a matter of mathematical or experimental skill." - Albert Einstein In Einstein and Infeld, Evolution of Physics (1938) -
Data Preparation Using R
# Attach the faraway package, which ships the data sets used in this chapter.
# library() is used instead of require(): require() merely returns FALSE when
# the package is missing, so the script would carry on and fail later with a
# less helpful error, whereas library() stops right here.
library(faraway)
# Load the pima data set from faraway and preview its first rows.
data(pima, package = "faraway")
head(pima)
## pregnant glucose diastolic triceps insulin bmi diabetes age test
## 1 6 148 72 35 0 33.6 0.627 50 1
## 2 1 85 66 29 0 26.6 0.351 31 0
## 3 8 183 64 0 0 23.3 0.672 32 1
## 4 1 89 66 23 94 28.1 0.167 21 0
## 5 0 137 40 35 168 43.1 2.288 33 1
## 6 5 116 74 0 0 25.6 0.201 30 0
# Per-column numerical summary of the raw data; the minimum and maximum of
# each variable are a quick way to spot implausible values before modelling.
summary(pima)
## pregnant glucose diastolic triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin bmi diabetes age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## test
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
## Several variables have a minimum of 0, which is suspicious for measurements such as blood pressure. Let's inspect the sorted diastolic values.
# Print the diastolic values in increasing order so that any run of
# suspicious small values (such as zeros) shows up at the start of the output.
sort(pima$diastolic)
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [19] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 24
## [37] 30 30 38 40 44 44 44 44 46 46 48 48 48 48 48 50 50 50
## [55] 50 50 50 50 50 50 50 50 50 50 52 52 52 52 52 52 52 52
## [73] 52 52 52 54 54 54 54 54 54 54 54 54 54 54 55 55 56 56
## [91] 56 56 56 56 56 56 56 56 56 56 58 58 58 58 58 58 58 58
## [109] 58 58 58 58 58 58 58 58 58 58 58 58 58 60 60 60 60 60
## [127] 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60
## [145] 60 60 60 60 60 60 60 60 60 60 60 60 60 60 61 62 62 62
## [163] 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62
## [181] 62 62 62 62 62 62 62 62 62 62 62 62 62 64 64 64 64 64
## [199] 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64
## [217] 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64
## [235] 64 64 65 65 65 65 65 65 65 66 66 66 66 66 66 66 66 66
## [253] 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66
## [271] 66 66 66 68 68 68 68 68 68 68 68 68 68 68 68 68 68 68
## [289] 68 68 68 68 68 68 68 68 68 68 68 68 68 68 68 68 68 68
## [307] 68 68 68 68 68 68 68 68 68 68 68 68 70 70 70 70 70 70
## [325] 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70
## [343] 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70
## [361] 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 72 72 72
## [379] 72 72 72 72 72 72 72 72 72 72 72 72 72 72 72 72 72 72
## [397] 72 72 72 72 72 72 72 72 72 72 72 72 72 72 72 72 72 72
## [415] 72 72 72 72 72 74 74 74 74 74 74 74 74 74 74 74 74 74
## [433] 74 74 74 74 74 74 74 74 74 74 74 74 74 74 74 74 74 74
## [451] 74 74 74 74 74 74 74 74 74 74 74 74 74 74 74 74 74 74
## [469] 74 74 74 75 75 75 75 75 75 75 75 76 76 76 76 76 76 76
## [487] 76 76 76 76 76 76 76 76 76 76 76 76 76 76 76 76 76 76
## [505] 76 76 76 76 76 76 76 76 76 76 76 76 76 76 78 78 78 78
## [523] 78 78 78 78 78 78 78 78 78 78 78 78 78 78 78 78 78 78
## [541] 78 78 78 78 78 78 78 78 78 78 78 78 78 78 78 78 78 78
## [559] 78 78 78 78 78 80 80 80 80 80 80 80 80 80 80 80 80 80
## [577] 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80
## [595] 80 80 80 80 80 80 80 80 80 82 82 82 82 82 82 82 82 82
## [613] 82 82 82 82 82 82 82 82 82 82 82 82 82 82 82 82 82 82
## [631] 82 82 82 84 84 84 84 84 84 84 84 84 84 84 84 84 84 84
## [649] 84 84 84 84 84 84 84 84 85 85 85 85 85 85 86 86 86 86
## [667] 86 86 86 86 86 86 86 86 86 86 86 86 86 86 86 86 86 88
## [685] 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88
## [703] 88 88 88 88 88 88 90 90 90 90 90 90 90 90 90 90 90 90
## [721] 90 90 90 90 90 90 90 90 90 90 92 92 92 92 92 92 92 92
## [739] 94 94 94 94 94 94 95 96 96 96 96 98 98 98 100 100 100 102
## [757] 104 104 106 106 106 108 108 110 110 110 114 122
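## The first 35 values are equal to 0. Are these real zeros? A diastolic blood pressure of 0 is not plausible, so we treat the zeros as missing values and recode them as NA (and do the same for the other measurements that cannot be 0).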
# A zero in any of these measurement columns is treated as a missing-value
# code rather than a real reading, so flag those entries as NA. The columns
# are independent of each other, so the order of these statements is
# irrelevant.
is.na(pima$glucose) <- pima$glucose == 0
is.na(pima$diastolic) <- pima$diastolic == 0
is.na(pima$triceps) <- pima$triceps == 0
is.na(pima$insulin) <- pima$insulin == 0
is.na(pima$bmi) <- pima$bmi == 0

# `test` only takes the codes 0 and 1, so store it as a categorical variable
# and count how many observations fall into each level.
pima[["test"]] <- factor(pima[["test"]])
summary(pima[["test"]])
## 0 1
## 500 268
# Rename the factor levels positionally: the existing levels are "0" and "1"
# (in that order, see the counts printed above), so "0" becomes "negative"
# and "1" becomes "positive".
levels(pima$test) <- c("negative","positive")
# Re-run the summary: the recoded zeros now show up as NA counts and `test`
# is tabulated by level instead of being averaged.
summary(pima)
## pregnant glucose diastolic triceps
## Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :29.00
## Mean : 3.845 Mean :121.7 Mean : 72.41 Mean :29.15
## 3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## NA's :5 NA's :35 NA's :227
## insulin bmi diabetes age
## Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00
## 1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
## Median :125.00 Median :32.30 Median :0.3725 Median :29.00
## Mean :155.55 Mean :32.46 Mean :0.4719 Mean :33.24
## 3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
## NA's :374 NA's :11
## test
## negative:500
## positive:268
##
##
##
##
##
"Good graphics are vital in data analysis. In some cases, the graphics can be so convincing that the formal analysis becomes just a confirmation of what has already been seen." - Faraway, 2014 -
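We start with base R graphics: a histogram, a kernel density estimate and a sorted-value plot of diastolic, followed by plots of diabetes against diastolic and against test. The same plots are then drawn with ggplot2.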
# Distribution of diastolic: histogram, then a kernel density estimate
# (the NA values created earlier must be dropped explicitly for density()).
hist(pima$diastolic, main = "", xlab = "Diastolic")
plot(density(pima$diastolic, na.rm = TRUE), main = "")

# Index plot of the sorted values.
plot(sort(pima$diastolic), ylab = "Sorted Diastolic")

# diabetes against a numeric predictor and against the `test` factor.
plot(diabetes ~ diastolic, data = pima)
plot(diabetes ~ test, data = pima)
# Attach ggplot2 with library() rather than require(): library() raises an
# error immediately if the package is not installed, instead of returning
# FALSE and letting the first ggplot() call fail.
library(ggplot2)
# Histogram of diastolic with the default binning.
ggplot(pima, aes(x = diastolic)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 35 rows containing non-finite values (stat_bin).
# Kernel density estimate of diastolic.
ggplot(pima, aes(x = diastolic)) +
  geom_density()
## Warning: Removed 35 rows containing non-finite values (stat_density).
# Scatter plot of diabetes against diastolic.
ggplot(pima, aes(x = diastolic, y = diabetes)) +
  geom_point()
## Warning: Removed 35 rows containing missing values (geom_point).
# Same scatter plot, with the point shape showing the test result and the
# legend laid out horizontally above the panel.
ggplot(pima, aes(x = diastolic, y = diabetes, shape = test)) +
  geom_point() +
  theme(legend.position = "top", legend.direction = "horizontal")
## Warning: Removed 35 rows containing missing values (geom_point).
# One panel per test result instead of one shape per test result.
ggplot(pima, aes(x = diastolic, y = diabetes)) +
  geom_point(size = 1) +
  facet_grid(~ test)
## Warning: Removed 35 rows containing missing values (geom_point).
# Load the manilius data set from faraway and look at its first rows.
data("manilius", package = "faraway")
head(manilius)
## arc sinang cosang group
## 1 13.16667 0.8836 -0.4682 1
## 2 13.13333 0.9996 -0.0282 1
## 3 13.20000 0.9899 0.1421 1
## 4 14.25000 0.2221 0.9750 3
## 5 14.70000 0.0006 1.0000 3
## 6 13.01667 0.9308 -0.3654 1
# Sum arc, sinang and cosang (the first three columns) within each group.
# The outer parentheses print the result as well as storing it in moon3.
(moon3 <- aggregate(
  manilius[, c("arc", "sinang", "cosang")],
  by = list(manilius$group),
  FUN = sum
))
## Group.1 arc sinang cosang
## 1 1 118.1333 8.4987 -0.7932
## 2 2 140.2833 -6.1404 1.7443
## 3 3 127.5333 2.9777 7.9649
# Solve the three summed equations for the three coefficients. Summing
#   arc = a + b * sinang + c * cosang
# over a group gives
#   sum(arc) = n * a + b * sum(sinang) + c * sum(cosang),
# where n is the number of observations in that group. The intercept column
# is therefore the vector of group sizes; it is computed from the data here
# instead of being hard-coded as 9, so the system stays correct if the
# groups are not all the same size. table() and aggregate() both order the
# groups by their sorted values, so the counts line up with the rows of moon3.
solve(
  cbind(as.vector(table(manilius$group)), moon3$sinang, moon3$cosang),
  moon3$arc
)
## [1] 14.5445859 -1.4898221 0.1341264
# Least-squares fit of the same model on all observations, for comparison
# with the coefficients obtained from the three summed equations.
lmod <- lm(arc ~ sinang + cosang, data = manilius)
coef(lmod)
## (Intercept) sinang cosang
## 14.56162351 -1.50458123 0.09136504
# Load the GaltonFamilies data from the HistData package (the package does
# not need to be attached for data() to find it).
data("GaltonFamilies", package = "HistData")

# Scatter plot of child height against mid-parent height, then the simple
# linear regression of one on the other.
plot(childHeight ~ midparentHeight, data = GaltonFamilies)
lmod <- lm(childHeight ~ midparentHeight, data = GaltonFamilies)
coef(lmod)
## (Intercept) midparentHeight
## 22.6362405 0.6373609
# Add the fitted regression line to the scatter plot.
abline(lmod)
# Slope from summary statistics: correlation times the ratio of standard
# deviations. It reproduces the slope reported by coef(lmod).
(beta <- with(
  GaltonFamilies,
  cor(midparentHeight, childHeight) * sd(childHeight) / sd(midparentHeight)
))
## [1] 0.6373609
# Matching intercept: the line passes through the point of means.
(alpha <- with(
  GaltonFamilies,
  mean(childHeight) - beta * mean(midparentHeight)
))
## [1] 22.63624
# Slope without the correlation factor, i.e. the plain ratio of standard
# deviations.
(beta1 <- with(
  GaltonFamilies,
  sd(childHeight) / sd(midparentHeight)
))
## [1] 1.985858
# Intercept that makes this second line pass through the point of means too.
(alpha1 <- with(
  GaltonFamilies,
  mean(childHeight) - beta1 * mean(midparentHeight)
))
## [1] -70.68889
# Draw the second line dashed so it can be told apart from the regression line.
abline(a = alpha1, b = beta1, lty = 2)
References
Faraway, J. J. (2004). Linear Models with R. https://doi.org/10.4324/9780203507278
Faraway, J. (2009). Linear Models with R. Texts in Statistical Science. Taylor and Francis Group.
Faraway, J. (2014). Texts in Statistical Science: Linear Models with R. Chapman & Hall/CRC Press. 274 pages, ISBN-13: 9781439887332.
0 comments :
Post a Comment