Chapter 3 (Inference)
library(faraway)
data(gala, package="faraway")
lmod <- lm(Species ~ Area + Elevation + Nearest + Scruz + Adjacent, gala)
nullmod <- lm(Species ~ 1, gala)
anova(nullmod, lmod)
## Analysis of Variance Table
##
## Model 1: Species ~ 1
## Model 2: Species ~ Area + Elevation + Nearest + Scruz + Adjacent
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 29 381081
## 2 24 89231 5 291850 15.699 6.838e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(rss0 <- deviance(nullmod))
## [1] 381081.4
(rss <- deviance(lmod))
## [1] 89231.37
(df0 <- df.residual(nullmod))
## [1] 29
(df <- df.residual(lmod))
## [1] 24
(fstat <- ((rss0-rss)/(df0-df))/(rss/df))
## [1] 15.69941
1-pf(fstat, df0-df, df)
## [1] 6.837893e-07
lmods <- lm(Species ~ Elevation + Nearest + Scruz + Adjacent, gala)
anova(lmods, lmod)
## Analysis of Variance Table
##
## Model 1: Species ~ Elevation + Nearest + Scruz + Adjacent
## Model 2: Species ~ Area + Elevation + Nearest + Scruz + Adjacent
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 25 93469
## 2 24 89231 1 4237.7 1.1398 0.2963
sumary(lmod)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.068221 19.154198 0.3690 0.7153508
## Area -0.023938 0.022422 -1.0676 0.2963180
## Elevation 0.319465 0.053663 5.9532 3.823e-06
## Nearest 0.009144 1.054136 0.0087 0.9931506
## Scruz -0.240524 0.215402 -1.1166 0.2752082
## Adjacent -0.074805 0.017700 -4.2262 0.0002971
##
## n = 30, p = 6, Residual SE = 60.97519, R-Squared = 0.77
sumary(lm(Species ~ Area, gala))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.782861 17.524416 3.6397 0.0010943
## Area 0.081963 0.019713 4.1578 0.0002748
##
## n = 30, p = 2, Residual SE = 91.73159, R-Squared = 0.38
lmods <- lm(Species ~ Elevation + Nearest + Scruz, gala)
anova(lmods, lmod)
## Analysis of Variance Table
##
## Model 1: Species ~ Elevation + Nearest + Scruz
## Model 2: Species ~ Area + Elevation + Nearest + Scruz + Adjacent
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 26 158292
## 2 24 89231 2 69060 9.2874 0.00103 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lmods <- lm(Species ~ I(Area+Adjacent) + Elevation + Nearest + Scruz, gala)
anova(lmods, lmod)
## Analysis of Variance Table
##
## Model 1: Species ~ I(Area + Adjacent) + Elevation + Nearest + Scruz
## Model 2: Species ~ Area + Elevation + Nearest + Scruz + Adjacent
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 25 109591
## 2 24 89231 1 20360 5.476 0.02793 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lmods <- lm(Species ~ Area+ offset(0.5*Elevation) + Nearest + Scruz + Adjacent, gala)
anova(lmods, lmod)
## Analysis of Variance Table
##
## Model 1: Species ~ Area + offset(0.5 * Elevation) + Nearest + Scruz +
## Adjacent
## Model 2: Species ~ Area + Elevation + Nearest + Scruz + Adjacent
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 25 131312
## 2 24 89231 1 42081 11.318 0.002574 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(tstat <- (0.31946-0.5)/0.05366)
## [1] -3.364517
2*pt(tstat, 24)
## [1] 0.002572168
tstat^2
## [1] 11.31998
lmod <- lm(Species ~ Nearest + Scruz, gala)
lms <- summary(lmod)
lms$fstatistic
## value numdf dendf
## 0.6019558 2.0000000 27.0000000
1-pf(lms$fstatistic[1],lms$fstatistic[2],lms$fstatistic[3])
## value
## 0.5549255
nreps <- 4000
set.seed(123)
fstats <- numeric(nreps)
for(i in 1:nreps){
lmods <- lm(sample(Species) ~ Nearest+Scruz, gala)
fstats[i] <- summary(lmods)$fstat[1]
}
mean(fstats > lms$fstat[1])
## [1] 0.55825
summary(lmod)$coef[3,]
## Estimate Std. Error t value Pr(>|t|)
## -0.4406401 0.4025312 -1.0946731 0.2833295
tstats <- numeric(nreps)
set.seed(123)
for(i in 1:nreps){
lmods <- lm(Species ~ Nearest+sample(Scruz), gala)
tstats[i] <- summary(lmods)$coef[3,3]
}
mean(abs(tstats) > abs(lms$coef[3,3]))
## [1] 0.26825
lmod <- lm(Species ~ Area + Elevation + Nearest + Scruz + Adjacent, gala)
sumary(lmod)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.068221 19.154198 0.3690 0.7153508
## Area -0.023938 0.022422 -1.0676 0.2963180
## Elevation 0.319465 0.053663 5.9532 3.823e-06
## Nearest 0.009144 1.054136 0.0087 0.9931506
## Scruz -0.240524 0.215402 -1.1166 0.2752082
## Adjacent -0.074805 0.017700 -4.2262 0.0002971
##
## n = 30, p = 6, Residual SE = 60.97519, R-Squared = 0.77
qt(0.975, 30-6)
## [1] 2.063899
-0.02394 + c(-1,1) * 2.0639 * 0.02242
## [1] -0.07021264 0.02233264
-0.07480 + c(-1,1) * 2.0639 * 0.01770
## [1] -0.11133103 -0.03826897
confint(lmod)
## 2.5 % 97.5 %
## (Intercept) -32.4641006 46.60054205
## Area -0.0702158 0.02233912
## Elevation 0.2087102 0.43021935
## Nearest -2.1664857 2.18477363
## Scruz -0.6850926 0.20404416
## Adjacent -0.1113362 -0.03827344
require(ellipse)
## Loading required package: ellipse
##
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
##
## pairs
plot(ellipse(lmod,c(2,6)),type="l",ylim=c(-0.13,0))
points(coef(lmod)[2], coef(lmod)[6], pch=19)
abline(v=confint(lmod)[2,],lty=2)
abline(h=confint(lmod)[6,],lty=2)
sample(10,rep=TRUE)
## [1] 7 3 4 9 1 5 1 2 4 10
set.seed(123)
nb <- 4000
coefmat <- matrix(NA,nb,6)
resids <- residuals(lmod)
preds <- fitted(lmod)
for(i in 1:nb){
booty <- preds + sample(resids, rep=TRUE)
bmod <- update(lmod, booty ~ .)
coefmat[i,] <- coef(bmod)
}
colnames(coefmat) <- c("Intercept",colnames(gala[,3:7]))
coefmat <- data.frame(coefmat)
apply(coefmat,2,function(x) quantile(x,c(0.025,0.975)))
## Intercept Area Elevation Nearest Scruz Adjacent
## 2.5% -25.31406 -0.06236506 0.2310989 -1.716588 -0.6061978 -0.10545278
## 97.5% 42.69309 0.01807403 0.4207570 2.122722 0.1677720 -0.03979658
require(ggplot2)
## Loading required package: ggplot2
ggplot(coefmat, aes(x=Area)) + geom_density() + geom_vline(xintercept=c(-0.0628, 0.0185),lty=2)
ggplot(coefmat, aes(x=Adjacent)) + geom_density() + geom_vline(xintercept=c(-0.104, -0.0409),lty=2)
Chapter 4 (Prediction)
library(faraway)
data(fat,package="faraway")
lmod <- lm(brozek ~ age + weight + height + neck + chest + abdom + hip + thigh + knee + ankle + biceps + forearm + wrist, data=fat)
x <- model.matrix(lmod)
(x0 <- apply(x,2,median))
## (Intercept) age weight height neck chest
## 1.00 43.00 176.50 70.00 38.00 99.65
## abdom hip thigh knee ankle biceps
## 90.95 99.30 59.00 38.50 22.80 32.05
## forearm wrist
## 28.70 18.30
(y0 <- sum(x0*coef(lmod)))
## [1] 17.49322
predict(lmod,new=data.frame(t(x0)))
## 1
## 17.49322
predict(lmod,new=data.frame(t(x0)),interval="prediction")
## fit lwr upr
## 1 17.49322 9.61783 25.36861
predict(lmod,new=data.frame(t(x0)),interval="confidence")
## fit lwr upr
## 1 17.49322 16.94426 18.04219
(x1 <- apply(x,2,function(x) quantile(x,0.95)))
## (Intercept) age weight height neck chest
## 1.000 67.000 225.650 74.500 41.845 116.340
## abdom hip thigh knee ankle biceps
## 110.760 112.125 68.545 42.645 25.445 37.200
## forearm wrist
## 31.745 19.800
predict(lmod, new=data.frame(t(x1)), interval="prediction")
## fit lwr upr
## 1 30.01804 21.92407 38.11202
predict(lmod, new=data.frame(t(x1)), interval="confidence")
## fit lwr upr
## 1 30.01804 28.07072 31.96537
data(airpass, package="faraway")
plot(pass ~ year, airpass, type="l", ylab="Log(Passengers)")
lmod <- lm(log(pass) ~ year, airpass)
lines(exp(predict(lmod)) ~ year, airpass)
lagdf <- embed(log(airpass$pass),14)
colnames(lagdf) <- c("y",paste0("lag",1:13))
lagdf <- data.frame(lagdf)
armod <- lm(y ~ lag1 + lag12 + lag13, data.frame(lagdf))
sumary(armod)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.138485 0.053607 2.5833 0.01092
## lag1 0.692308 0.061858 11.1919 < 2.2e-16
## lag12 0.921518 0.034732 26.5321 < 2.2e-16
## lag13 -0.632144 0.067679 -9.3403 4.156e-16
##
## n = 131, p = 4, Residual SE = 0.04164, R-Squared = 0.99
plot(pass ~ year, airpass, type="l")
lines(airpass$year[14:144], exp(predict(armod)), lty=2)
lagdf[nrow(lagdf),]
## y lag1 lag2 lag3 lag4 lag5 lag6 lag7
## 131 6.068426 5.966147 6.133398 6.230481 6.40688 6.43294 6.282267 6.156979
## lag8 lag9 lag10 lag11 lag12 lag13
## 131 6.133398 6.037871 5.968708 6.033086 6.003887 5.891644
predict(armod, data.frame(lag1=6.0684, lag12=6.0331, lag13=6.0039),interval="prediction")
## fit lwr upr
## 1 6.103972 6.020606 6.187338
Chapter 5 (Explanation)
library(faraway)
data(gala, package="faraway")
lmod <- lm(Species ~ Area + Elevation + Nearest + Scruz + Adjacent, gala)
sumary(lmod)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.068221 19.154198 0.3690 0.7153508
## Area -0.023938 0.022422 -1.0676 0.2963180
## Elevation 0.319465 0.053663 5.9532 3.823e-06
## Nearest 0.009144 1.054136 0.0087 0.9931506
## Scruz -0.240524 0.215402 -1.1166 0.2752082
## Adjacent -0.074805 0.017700 -4.2262 0.0002971
##
## n = 30, p = 6, Residual SE = 60.97519, R-Squared = 0.77
sumary(lm(Species ~ Elevation, gala))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.335113 19.205288 0.5902 0.5598
## Elevation 0.200792 0.034646 5.7955 3.177e-06
##
## n = 30, p = 2, Residual SE = 78.66154, R-Squared = 0.55
plot(Species ~ Elevation, gala)
abline(11.3,0.201)
colMeans(gala)
## Species Endemics Area Elevation Nearest Scruz Adjacent
## 85.23333 26.10000 261.70867 368.03333 10.06000 56.97667 261.09833
p <- predict(lmod, data.frame(Area=261.7, Elevation=gala$Elevation, Nearest=10.06, Scruz = 56.98, Adjacent=261.1))
i <- order(gala$Elevation)
lines(gala$Elevation[i], p[i], lty=2)
data(newhamp, package="faraway")
colSums(newhamp[newhamp$votesys == 'D',2:3])
## Obama Clinton
## 86353 96890
colSums(newhamp[newhamp$votesys == 'H',2:3])
## Obama Clinton
## 16926 14471
newhamp$trt <- ifelse(newhamp$votesys == 'H',1,0)
lmodu <- lm(pObama ~ trt, newhamp)
sumary(lmodu)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3525171 0.0051728 68.1480 < 2.2e-16
## trt 0.0424871 0.0085091 4.9932 1.059e-06
##
## n = 276, p = 2, Residual SE = 0.06823, R-Squared = 0.08
lmodz <- lm(pObama ~ trt + Dean , newhamp)
sumary(lmodz)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2211192 0.0112502 19.6547 <2e-16
## trt -0.0047538 0.0077608 -0.6125 0.5407
## Dean 0.5228967 0.0416500 12.5545 <2e-16
##
## n = 276, p = 3, Residual SE = 0.05443, R-Squared = 0.42
sumary(lm(Dean ~ trt, newhamp))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2512886 0.0059850 41.9861 < 2.2e-16
## trt 0.0903446 0.0098451 9.1766 < 2.2e-16
##
## n = 276, p = 2, Residual SE = 0.07895, R-Squared = 0.24
require(Matching)
## Loading required package: Matching
## Loading required package: MASS
## ##
## ## Matching (Version 4.9-7, Build Date: 2020-02-05)
## ## See http://sekhon.berkeley.edu/matching for additional documentation.
## ## Please cite software as:
## ## Jasjeet S. Sekhon. 2011. ``Multivariate and Propensity Score Matching
## ## Software with Automated Balance Optimization: The Matching package for R.''
## ## Journal of Statistical Software, 42(7): 1-52.
## ##
set.seed(123)
mm <- GenMatch(newhamp$trt, newhamp$Dean, ties=FALSE, caliper=0.05, pop.size=1000)
## Loading required namespace: rgenoud
##
##
## Sun Nov 01 09:00:45 2020
## Domains:
## 0.000000e+00 <= X1 <= 1.000000e+03
##
## Data Type: Floating Point
## Operators (code number, name, population)
## (1) Cloning........................... 122
## (2) Uniform Mutation.................. 125
## (3) Boundary Mutation................. 125
## (4) Non-Uniform Mutation.............. 125
## (5) Polytope Crossover................ 125
## (6) Simple Crossover.................. 126
## (7) Whole Non-Uniform Mutation........ 125
## (8) Heuristic Crossover............... 126
## (9) Local-Minimum Crossover........... 0
##
## SOFT Maximum Number of Generations: 100
## Maximum Nonchanging Generations: 4
## Population size : 1000
## Convergence Tolerance: 1.000000e-03
##
## Not Using the BFGS Derivative Based Optimizer on the Best Individual Each Generation.
## Not Checking Gradients before Stopping.
## Using Out of Bounds Individuals.
##
## Maximization Problem.
## GENERATION: 0 (initializing the population)
## Lexical Fit..... 2.127495e-01 9.987660e-01
## #unique......... 1000, #Total UniqueCount: 1000
## var 1:
## best............ 1.000000e+00
## mean............ 5.006683e+02
## variance........ 8.388390e+04
##
## GENERATION: 1
## Lexical Fit..... 6.264505e-01 9.987660e-01
## #unique......... 604, #Total UniqueCount: 1604
## var 1:
## best............ 2.255547e-02
## mean............ 2.280858e+02
## variance........ 7.904885e+04
##
## GENERATION: 2
## Lexical Fit..... 9.910633e-01 9.987660e-01
## #unique......... 602, #Total UniqueCount: 2206
## var 1:
## best............ 2.702493e-02
## mean............ 7.389477e+01
## variance........ 3.118565e+04
##
## GENERATION: 3
## Lexical Fit..... 9.910633e-01 9.987660e-01
## #unique......... 604, #Total UniqueCount: 2810
## var 1:
## best............ 2.702493e-02
## mean............ 7.042592e+01
## variance........ 2.858671e+04
##
## GENERATION: 4
## Lexical Fit..... 9.910633e-01 9.987660e-01
## #unique......... 415, #Total UniqueCount: 3225
## var 1:
## best............ 2.702493e-02
## mean............ 6.569943e+01
## variance........ 2.806356e+04
##
## GENERATION: 5
## Lexical Fit..... 9.910633e-01 9.987660e-01
## #unique......... 425, #Total UniqueCount: 3650
## var 1:
## best............ 2.702493e-02
## mean............ 8.035088e+01
## variance........ 3.395814e+04
##
## GENERATION: 6
## Lexical Fit..... 9.947238e-01 9.987660e-01
## #unique......... 405, #Total UniqueCount: 4055
## var 1:
## best............ 2.604222e-03
## mean............ 6.399581e+01
## variance........ 2.788729e+04
##
## GENERATION: 7
## Lexical Fit..... 9.970770e-01 9.987660e-01
## #unique......... 595, #Total UniqueCount: 4650
## var 1:
## best............ 2.408582e-03
## mean............ 7.120129e+01
## variance........ 3.239014e+04
##
## GENERATION: 8
## Lexical Fit..... 9.970770e-01 9.987660e-01
## #unique......... 596, #Total UniqueCount: 5246
## var 1:
## best............ 2.408582e-03
## mean............ 7.760128e+01
## variance........ 3.490217e+04
##
## GENERATION: 9
## Lexical Fit..... 9.970770e-01 9.987660e-01
## #unique......... 403, #Total UniqueCount: 5649
## var 1:
## best............ 2.408582e-03
## mean............ 6.762662e+01
## variance........ 2.926614e+04
##
## GENERATION: 10
## Lexical Fit..... 9.970770e-01 9.987660e-01
## #unique......... 429, #Total UniqueCount: 6078
## var 1:
## best............ 2.408582e-03
## mean............ 7.201190e+01
## variance........ 2.786938e+04
##
## GENERATION: 11
## Lexical Fit..... 9.987660e-01 9.990051e-01
## #unique......... 423, #Total UniqueCount: 6501
## var 1:
## best............ 2.336515e-03
## mean............ 6.935256e+01
## variance........ 3.131574e+04
##
## GENERATION: 12
## Lexical Fit..... 9.987660e-01 9.990882e-01
## #unique......... 612, #Total UniqueCount: 7113
## var 1:
## best............ 2.364199e-03
## mean............ 6.593385e+01
## variance........ 2.785792e+04
##
## GENERATION: 13
## Lexical Fit..... 9.987660e-01 9.990882e-01
## #unique......... 592, #Total UniqueCount: 7705
## var 1:
## best............ 2.364199e-03
## mean............ 6.576213e+01
## variance........ 2.685578e+04
##
## GENERATION: 14
## Lexical Fit..... 9.987660e-01 9.990882e-01
## #unique......... 417, #Total UniqueCount: 8122
## var 1:
## best............ 2.364199e-03
## mean............ 6.042472e+01
## variance........ 2.583306e+04
##
## GENERATION: 15
## Lexical Fit..... 9.987660e-01 9.990882e-01
## #unique......... 423, #Total UniqueCount: 8545
## var 1:
## best............ 2.364199e-03
## mean............ 6.067741e+01
## variance........ 2.497988e+04
##
## GENERATION: 16
## Lexical Fit..... 9.987660e-01 9.993674e-01
## #unique......... 437, #Total UniqueCount: 8982
## var 1:
## best............ 7.518945e-04
## mean............ 6.819509e+01
## variance........ 2.980902e+04
##
## 'wait.generations' limit reached.
## No significant improvement in 4 generations.
##
## Solution Lexical Fitness Value:
## 9.987660e-01 9.993674e-01
##
## Parameters at the Solution:
##
## X[ 1] : 7.518945e-04
##
## Solution Found Generation 11
## Number of Generations Run 16
##
## Sun Nov 01 09:01:06 2020
## Total run time : 0 hours 0 minutes and 21 seconds
head(mm$matches[,1:2])
## [,1] [,2]
## [1,] 4 213
## [2,] 17 148
## [3,] 18 6
## [4,] 19 83
## [5,] 21 246
## [6,] 22 185
newhamp[c(4,218),c('Dean','pObama','trt')]
## Dean pObama trt
## CenterHarbor 0.28495 0.3432836 1
## Northwood 0.28264 0.3368757 0
plot(pObama ~ Dean, newhamp, pch=trt+1)
with(newhamp,segments(Dean[mm$match[,1]],pObama[mm$match[,1]],Dean[mm$match[,2]],pObama[mm$match[,2]]))
pdiff <- newhamp$pObama[mm$matches[,1]] - newhamp$pObama[mm$matches[,2]]
t.test(pdiff)
##
## One Sample t-test
##
## data: pdiff
## t = -1.8798, df = 86, p-value = 0.06352
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.0328560272 0.0009183504
## sample estimates:
## mean of x
## -0.01596884
plot(pdiff ~ newhamp$Dean[mm$matches[,1]], xlab="Dean", ylab="Hand-Digital")
abline(h=0)
plot(pObama ~ Dean, newhamp, pch=trt+1)
abline(h=c(.353, 0.353+0.042),lty=1:2)
abline(0.221, 0.5229)
abline(0.221-0.005, 0.5229, lty=2)
with(newhamp,segments(Dean[mm$match[,1]],pObama[mm$match[,1]],Dean[mm$match[,2]],pObama[mm$match[,2]],col=gray(0.75)))
References
Faraway, J. J. (2004). Linear Models with R. In Linear Models with R. https://doi.org/10.4324/9780203507278
Faraway, J. (2009). Texts in Statistical Science: Linear Models with R. In Taylor and Francis Group.
Faraway, J. (2014). Texts in Statistical Science: Linear Models with R. Chapman & Hall/CRC Press. 274 pages, ISBN-13: 9781439887332.