�ݺ�ߣ

Code for QSS Chapter 4: Prediction
Kosuke Imai
First Printing
Section 4.1: Predicting Election Outcomes
Section 4.1.1: Loops in R
values <- c(2, 4, 6)
n <- length(values) # number of elements in `values'
results <- rep(NA, n) # empty container vector for storing the results
## loop counter `i' will take values on 1, 2, ..., n in that order
for (i in 1:n) {
## store the result of multiplication as the ith element of
## `results' vector
results[i] <- values[i] * 2
cat(values[i], "times 2 is equal to", results[i], "n")
}
## 2 times 2 is equal to 4
results
## [1] 4 8 12
## check if the code runs when i = 1
i <- 1
x <- values[i] * 2
cat(values[i], "times 2 is equal to", x, "n")
Section 4.1.2: General Conditional Statements in R
## define the operation to be executed
operation <- "add"
if (operation == "add") {
cat("I will perform addition 4 + 4n")
4 + 4
}
## I will perform addition 4 + 4
## [1] 8
if (operation == "multiply") {
cat("I will perform multiplication 4 * 4n")
4 * 4
}
1

## Note that òperation' is redefined
operation <- "multiply"
cat("I will perform addition 4 + 4")
4 + 4
} else {
cat("I will perform multiplication 4 * 4")
4 * 4
}
## I will perform multiplication 4 * 4
## [1] 16
## Note that òperation' is redefined
operation <- "subtract"
cat("I will perform addition 4 + 4n")
4 + 4
} else if (operation == "multiply") {
cat("I will perform multiplication 4 * 4n")
4 * 4
} else {
cat("`", operation, "' is invalid. Use either àdd' or `multiply'.n",
sep = "")
}
## `subtract' is invalid. Use either àdd' or `multiply'.
values <- 1:5
n <- length(values)
results <- rep(NA, n)
for (i in 1:n) {
## x and r get overwritten in each iteration
x <- values[i]
r <- x %% 2 # remainder when divided by 2 to check whether even or odd
if (r == 0) { # remainder is zero
cat(x, "is even and I will perform addition",
x, "+", x, "n")
results[i] <- x + x
} else { # remainder is not zero
cat(x, "is odd and I will perform multiplication",
x, "*", x, "n")
results[i] <- x * x
}
}
## 1 is odd and I will perform multiplication 1 * 1
## 2 is even and I will perform addition 2 + 2
## 4 is even and I will perform addition 4 + 4
results
## [1] 1 4 9 8 25
2

Section 4.1.3: Poll Predictions
## load election results, by state
pres08 <- read.csv("pres08.csv")
## load polling data
polls08 <- read.csv("polls08.csv")
## compute Obama's margin
polls08$margin <- polls08$Obama - polls08$McCain
pres08$margin <- pres08$Obama - pres08$McCain
x <- as.Date("2008-11-04")
y <- as.Date("2008/9/1")
x - y # number of days between 2008/9/1 and 11/4
## Time difference of 64 days
## convert to a Date object
polls08$middate <- as.Date(polls08$middate)
## computer the number of days to the election day
polls08$DaysToElection <- as.Date("2008-11-04") - polls08$middate
poll.pred <- rep(NA, 51) # initialize a vector place holder
## extract unique state names which the loop will iterate through
st.names <- unique(polls08$state)
## add state names as labels for easy interpretation later on
names(poll.pred) <- as.character(st.names)
## loop across 50 states plus DC
for (i in 1:51){
## subset the ith state
state.data <- subset(polls08, subset = (state == st.names[i]))
## further subset the latest polls within the state
latest <- subset(state.data, DaysToElection == min(DaysToElection))
## compute the mean of latest polls and store it
poll.pred[i] <- mean(latest$margin)
}
## error of latest polls
errors <- pres08$margin - poll.pred
names(errors) <- st.names # add state names
mean(errors) # mean prediction error
## [1] 1.062092
sqrt(mean(errors^2))
## [1] 5.90894
## histogram
hist(errors, freq = FALSE, ylim = c(0, 0.08),
main = "Poll prediction error",
xlab = "Error in predicted margin for Obama (percentage points)")
## add mean
abline(v = mean(errors), lty = "dashed", col = "red")
3

text(x = -7, y = 0.07, "average error", col = "red")
Poll prediction error
Error in predicted margin for Obama (percentage points)
Density
−20 −10 0 10 20
0.00
0.02
0.04
0.06
0.08
average error
## type = "n" generates "empty" plot
plot(poll.pred, pres08$margin, type = "n", main = "", xlab = "Poll results",
xlim = c(-40, 90), ylim = c(-40, 90), ylab = "Actual election results")
## add state abbreviations
text(x = poll.pred, y = pres08$margin, labels = pres08$state, col = "blue")
## lines
abline(a = 0, b = 1, lty = "dashed") # 45 degree line
abline(v = 0) # vertical line at 0
abline(h = 0) # horizontal line at 0
4

−40 −20 0 20 40 60 80
−40
0
20
40
60
80
Poll results
Actual
election
results
ALAK
AZ
AR
CA
CO
CT
DC
DE
FL
GA
HI
ID
IL
IN
IA
KS
KY LA
ME
MD
MA
MI
MN
MS
MO
MT
NE
NV
NH
NJ
NM
NY
NC
ND
OH
OK
OR
PA
RI
SC
SD
TNTX
UT
VT
VA
WA
WV
WI
WY
## which state polls called wrong?
pres08$state[sign(poll.pred) != sign(pres08$margin)]
## [1] IN MO NC
## 51 Levels: AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA ... WY
## what was the actual margin for these states?
pres08$margin[sign(poll.pred) != sign(pres08$margin)]
## [1] 1 -1 1
## actual results: total number of electoral votes won by Obama
sum(pres08$EV[pres08$margin > 0])
## [1] 364
## poll prediction
sum(pres08$EV[poll.pred > 0])
## [1] 349
## load the data
pollsUS08 <- read.csv("pollsUS08.csv")
## compute number of days to the election as before
pollsUS08$middate <- as.Date(pollsUS08$middate)
pollsUS08$DaysToElection <- as.Date("2008-11-04") - pollsUS08$middate
## empty vectors to store predictions
Obama.pred <- McCain.pred <- rep(NA, 90)
for (i in 1:90) {
## take all polls conducted within the past 7 days
week.data <- subset(pollsUS08, subset = ((DaysToElection <= (90 - i + 7))
& (DaysToElection > (90 - i))))
5

## compute support for each candidate using the average
Obama.pred[i] <- mean(week.data$Obama)
McCain.pred[i] <- mean(week.data$McCain)
}
## plot going from 90 days to 1 day before the election
plot(90:1, Obama.pred, type = "b", xlim = c(90, 0), ylim = c(40, 60),
col = "blue", xlab = "Days to the election",
ylab = "Support for candidate (percentage points)")
## `type = "b"' gives plot that includes both points and lines
lines(90:1, McCain.pred, type = "b", col = "red")
## actual election results: pch = 19 gives solid circles
points(0, 52.93, pch = 19, col = "blue")
points(0, 45.65, pch = 19, col = "red")
## line indicating the election day
abline(v = 0)
## labeling candidates
text(80, 48, "Obama", col = "blue")
text(80, 41, "McCain", col = "red")
80 60 40 20 0
40
45
50
55
60
Days to the election
Support
for
candidate
(percentage
points)
Obama
McCain
Section 4.2: Linear Regression
Section 4.2.1: Facial Appearance and Election Outcomes
6

## load the data
face <- read.csv("face.csv")
## two-party vote share for Democrats and Republicans
face$d.share <- face$d.votes / (face$d.votes + face$r.votes)
face$r.share <- face$r.votes / (face$d.votes + face$r.votes)
face$diff.share <- face$d.share - face$r.share
plot(face$d.comp, face$diff.share, pch = 16,
col = ifelse(face$w.party == "R", "red", "blue"),
xlim = c(0, 1), ylim = c(-1, 1),
xlab = "Competence scores for Democrats",
ylab = "Democratic margin in vote share",
main = "Facial competence and vote share")
0.0 0.2 0.4 0.6 0.8 1.0
−1.0
−0.5
0.0
0.5
1.0
Facial competence and vote share
Competence scores for Democrats
Democratic
margin
in
vote
share
Section 4.2.2: Correlation and Scatter Plots
cor(face$d.comp, face$diff.share)
## [1] 0.4327743
Section 4.2.3: Least Squares
fit <- lm(diff.share ~ d.comp, data = face) # fit the model
fit
##
## Call:
7

## lm(formula = diff.share ~ d.comp, data = face)
##
## Coefficients:
## (Intercept) d.comp
## -0.3122 0.6604
## lm(face$diff.share ~ face$d.comp)
coef(fit) # get estimated coefficients
## (Intercept) d.comp
## -0.3122259 0.6603815
head(fitted(fit)) # get fitted or predicted values
## 1 2 3 4 5 6
## 0.06060411 -0.08643340 0.09217061 0.04539236 0.13698690 -0.10057206
plot(face$d.comp, face$diff.share, xlim = c(0, 1.05), ylim = c(-1,1),
xlab = "Competence scores for Democrats",
ylab = "Democratic margin in vote share",
main = "Facial competence and vote share")
abline(fit) # add regression line
abline(v = 0, lty = "dashed")
0.0 0.2 0.4 0.6 0.8 1.0
−1.0
−0.5
0.0
0.5
1.0
Facial competence and vote share
Competence scores for Democrats
Democratic
margin
in
vote
share
epsilon.hat <- resid(fit) # residuals
sqrt(mean(epsilon.hat^2)) # RMSE
## [1] 0.2642361
8

Section 4.2.4: Regression Towards the Mean
Section 4.2.5: Merging Data Sets in R
pres12 <- read.csv("pres12.csv") # load 2012 data
## quick look at two data sets
head(pres08)
## state.name state Obama McCain EV margin
## 1 Alabama AL 39 60 9 -21
## 2 Alaska AK 38 59 3 -21
## 3 Arizona AZ 45 54 10 -9
## 4 Arkansas AR 39 59 6 -20
## 5 California CA 61 37 55 24
## 6 Colorado CO 54 45 9 9
head(pres12)
## state Obama Romney EV
## 1 AL 38 61 9
## 2 AK 41 55 3
## 3 AZ 45 54 11
## 4 AR 37 61 6
## 5 CA 60 37 55
## 6 CO 51 46 9
## merge two data frames
pres <- merge(pres08, pres12, by = "state")
## summarize the merged data frame
summary(pres)
## state state.name Obama.x McCain
## AK : 1 Alabama : 1 Min. :33.00 Min. : 7.00
## AL : 1 Alaska : 1 1st Qu.:43.00 1st Qu.:40.00
## AR : 1 Arizona : 1 Median :51.00 Median :47.00
## AZ : 1 Arkansas : 1 Mean :51.37 Mean :47.06
## CA : 1 California: 1 3rd Qu.:57.50 3rd Qu.:56.00
## CO : 1 Colorado : 1 Max. :92.00 Max. :66.00
## (Other):45 (Other) :45
## EV.x margin Obama.y Romney
## Min. : 3.00 Min. :-32.000 Min. :25.00 Min. : 7.00
## 1st Qu.: 4.50 1st Qu.:-13.000 1st Qu.:40.50 1st Qu.:41.00
## Median : 8.00 Median : 4.000 Median :51.00 Median :48.00
## Mean :10.55 Mean : 4.314 Mean :49.06 Mean :49.04
## 3rd Qu.:11.50 3rd Qu.: 17.500 3rd Qu.:56.00 3rd Qu.:58.00
## Max. :55.00 Max. : 85.000 Max. :91.00 Max. :73.00
##
## EV.y
## Min. : 3.00
## 1st Qu.: 4.50
## Median : 8.00
## Mean :10.55
## 3rd Qu.:11.50
## Max. :55.00
9

##
## change the variable name for illustration
names(pres12)[1] <- "state.abb"
## merging data sets using the variables of different names
pres <- merge(pres08, pres12, by.x = "state", by.y = "state.abb")
summary(pres)
## state state.name Obama.x McCain
## AK : 1 Alabama : 1 Min. :33.00 Min. : 7.00
## AL : 1 Alaska : 1 1st Qu.:43.00 1st Qu.:40.00
## AR : 1 Arizona : 1 Median :51.00 Median :47.00
## AZ : 1 Arkansas : 1 Mean :51.37 Mean :47.06
## CA : 1 California: 1 3rd Qu.:57.50 3rd Qu.:56.00
## CO : 1 Colorado : 1 Max. :92.00 Max. :66.00
## (Other):45 (Other) :45
## EV.x margin Obama.y Romney
## Min. : 3.00 Min. :-32.000 Min. :25.00 Min. : 7.00
## 1st Qu.: 4.50 1st Qu.:-13.000 1st Qu.:40.50 1st Qu.:41.00
## Median : 8.00 Median : 4.000 Median :51.00 Median :48.00
## Mean :10.55 Mean : 4.314 Mean :49.06 Mean :49.04
## 3rd Qu.:11.50 3rd Qu.: 17.500 3rd Qu.:56.00 3rd Qu.:58.00
## Max. :55.00 Max. : 85.000 Max. :91.00 Max. :73.00
##
## EV.y
## Min. : 3.00
## 1st Qu.: 4.50
## Median : 8.00
## Mean :10.55
## 3rd Qu.:11.50
## Max. :55.00
##
## cbinding two data frames
pres1 <- cbind(pres08, pres12)
## this shows all variables are kept
summary(pres1)
## state.name state Obama McCain
## Alabama : 1 AK : 1 Min. :33.00 Min. : 7.00
## Alaska : 1 AL : 1 1st Qu.:43.00 1st Qu.:40.00
## Arizona : 1 AR : 1 Median :51.00 Median :47.00
## Arkansas : 1 AZ : 1 Mean :51.37 Mean :47.06
## California: 1 CA : 1 3rd Qu.:57.50 3rd Qu.:56.00
## Colorado : 1 CO : 1 Max. :92.00 Max. :66.00
## (Other) :45 (Other):45
## EV margin state.abb Obama
## Min. : 3.00 Min. :-32.000 AK : 1 Min. :25.00
## 1st Qu.: 4.50 1st Qu.:-13.000 AL : 1 1st Qu.:40.50
## Median : 8.00 Median : 4.000 AR : 1 Median :51.00
## Mean :10.55 Mean : 4.314 AZ : 1 Mean :49.06
## 3rd Qu.:11.50 3rd Qu.: 17.500 CA : 1 3rd Qu.:56.00
## Max. :55.00 Max. : 85.000 CO : 1 Max. :91.00
## (Other):45
## Romney EV
10

## Min. : 7.00 Min. : 3.00
## 1st Qu.:41.00 1st Qu.: 4.50
## Median :48.00 Median : 8.00
## Mean :49.04 Mean :10.55
## 3rd Qu.:58.00 3rd Qu.:11.50
## Max. :73.00 Max. :55.00
##
## DC and DE are flipped in this alternative approach
pres1[8:9, ]
## state.name state Obama McCain EV margin state.abb Obama Romney EV
## 8 D.C. DC 92 7 3 85 DE 59 40 3
## 9 Delaware DE 62 37 3 25 DC 91 7 3
## merge() does not have this problem
pres[8:9, ]
## state state.name Obama.x McCain EV.x margin Obama.y Romney EV.y
## 8 DC D.C. 92 7 3 85 91 7 3
## 9 DE Delaware 62 37 3 25 59 40 3
pres$Obama2008.z <- scale(pres$Obama.x)
pres$Obama2012.z <- scale(pres$Obama.y)
## intercept is estimated essentially zero
fit1 <- lm(Obama2012.z ~ Obama2008.z, data = pres)
fit1
##
## Call:
## lm(formula = Obama2012.z ~ Obama2008.z, data = pres)
##
## Coefficients:
## (Intercept) Obama2008.z
## -3.521e-17 9.834e-01
## regression without an intercept; estimated slope is identical
fit1 <- lm(Obama2012.z ~ -1 + Obama2008.z, data = pres)
fit1
##
## Call:
## lm(formula = Obama2012.z ~ -1 + Obama2008.z, data = pres)
##
## Coefficients:
## Obama2008.z
## 0.9834
plot(pres$Obama2008.z, pres$Obama2012.z, xlim = c(-4, 4), ylim = c(-4, 4),
xlab = "Obama's standardized vote share in 2008",
ylab = "Obama's standardized vote share in 2012")
abline(fit1) # draw a regression line
11

−4 −2 0 2 4
−4
−2
0
2
4
Obama's standardized vote share in 2008
Obama's
standardized
vote
share
in
2012
## bottom quartile
mean((pres$Obama2012.z >
pres$Obama2008.z)[pres$Obama2008.z
<= quantile(pres$Obama2008.z, 0.25)])
## [1] 0.5714286
## top quartile
mean((pres$Obama2012.z >
pres$Obama2008.z)[pres$Obama2008.z
>= quantile(pres$Obama2008.z, 0.75)])
## [1] 0.4615385
Section 4.2.6: Model Fit
florida <- read.csv("florida.csv")
## regress Buchanan's 2000 votes on Perot's 1996 votes
fit2 <- lm(Buchanan00 ~ Perot96, data = florida)
fit2
##
## Call:
## lm(formula = Buchanan00 ~ Perot96, data = florida)
##
## Coefficients:
## (Intercept) Perot96
## 1.34575 0.03592
## compute TSS (total sum of squares) and SSR (sum of squared residuals)
TSS2 <- sum((florida$Buchanan00 - mean(florida$Buchanan00))^2)
12

SSR2 <- sum(resid(fit2)^2)
## Coefficient of determination
(TSS2 - SSR2) / TSS2
## [1] 0.5130333
R2 <- function(fit) {
resid <- resid(fit) # residuals
y <- fitted(fit) + resid # outcome variable
TSS <- sum((y - mean(y))^2)
SSR <- sum(resid^2)
R2 <- (TSS - SSR) / TSS
return(R2)
}
R2(fit2)
## [1] 0.5130333
## built-in R function
summary(fit2)$r.squared
## [1] 0.5130333
R2(fit1)
## [1] 0.9671579
plot(fitted(fit2), resid(fit2), xlim = c(0, 1500), ylim = c(-750, 2500),
xlab = "Fitted values", ylab = "Residuals")
abline(h = 0)
0 500 1000 1500
−500
0
500
1500
2500
Fitted values
Residuals
florida$county[resid(fit2) == max(resid(fit2))]
## [1] PalmBeach
13

## 67 Levels: Alachua Baker Bay Bradford Brevard Broward ... Washington
## data without Palm Beach
florida.pb <- subset(florida, subset = (county != "PalmBeach"))
fit3 <- lm(Buchanan00 ~ Perot96, data = florida.pb)
fit3
##
## Call:
## lm(formula = Buchanan00 ~ Perot96, data = florida.pb)
##
## Coefficients:
## (Intercept) Perot96
## 45.84193 0.02435
## R^2 or coefficient of determination
R2(fit3)
## [1] 0.8511675
## residual plot
plot(fitted(fit3), resid(fit3), xlim = c(0, 1500), ylim = c(-750, 2500),
xlab = "Fitted values", ylab = "Residuals",
main = "Residual plot without Palm Beach")
abline(h = 0) # horizontal line at 0
0 500 1000 1500
−500
0
500
1500
2500
Residual plot without Palm Beach
Fitted values
Residuals
plot(florida$Perot96, florida$Buchanan00, xlab = "Perot's votes in 1996",
ylab = "Buchanan's votes in 2000")
abline(fit2, lty = "dashed") # regression with Palm Beach
abline(fit3) # regression without Palm Beach
text(30000, 3250, "Palm Beach")
14

text(30000, 1500, "regressionn with Palm Beach")
text(30000, 400, "regressionn without Palm Beach")
0 10000 20000 30000 40000
0
500
1500
2500
3500
Perot's votes in 1996
Buchanan's
votes
in
2000
Palm Beach
regression
with Palm Beach
regression
without Palm Beach
Section 4.3: Regression and Causation
Section 4.3.1: Randomized Experiments
women <- read.csv("women.csv")
## proportion of female politicians in reserved GP vs. unreserved GP
mean(women$female[women$reserved == 1])
## [1] 1
mean(women$female[women$reserved == 0])
## [1] 0.07476636
## drinking-water facilities
mean(women$water[women$reserved == 1]) -
mean(women$water[women$reserved == 0])
## [1] 9.252423
## irrigation facilities
mean(women$irrigation[women$reserved == 1]) -
mean(women$irrigation[women$reserved == 0])
## [1] -0.3693319
lm(water ~ reserved, data = women)
15

##
## Call:
## lm(formula = water ~ reserved, data = women)
##
## Coefficients:
## (Intercept) reserved
## 14.738 9.252
lm(irrigation ~ reserved, data = women)
##
## Call:
## lm(formula = irrigation ~ reserved, data = women)
##
## Coefficients:
## (Intercept) reserved
## 3.3879 -0.3693
Section 4.3.2: Regression with Multiple Predictors
social <- read.csv("social.csv")
levels(social$messages) # base level is `Civic'
## [1] "Civic Duty" "Control" "Hawthorne" "Neighbors"
fit <- lm(primary2008 ~ messages, data = social)
fit
##
## Call:
## lm(formula = primary2008 ~ messages, data = social)
##
## Coefficients:
## (Intercept) messagesControl messagesHawthorne
## 0.314538 -0.017899 0.007837
## messagesNeighbors
## 0.063411
## ## create indicator variables
## social$Control <- ifelse(social$messages == "Control", 1, 0)
## social$Hawthorne <- ifelse(social$messages == "Hawthorne", 1, 0)
## social$Neighbors <- ifelse(social$messages == "Neighbors", 1, 0)
## ## fit the same regression as above by directly using indicator variables
## lm(primary2008 ~ Control + Hawthorne + Neighbors, data = social)
## create a data frame with unique values of `messages'
unique.messages <- data.frame(messages = unique(social$messages))
unique.messages
## messages
## 1 Civic Duty
## 2 Hawthorne
## 3 Control
## 4 Neighbors
16

## make prediction for each observation from this new data frame
predict(fit, newdata = unique.messages)
## 1 2 3 4
## 0.3145377 0.3223746 0.2966383 0.3779482
## sample average
tapply(social$primary2008, social$messages, mean)
## Civic Duty Control Hawthorne Neighbors
## 0.3145377 0.2966383 0.3223746 0.3779482
## linear regression without intercept
fit.noint <- lm(primary2008 ~ -1 + messages, data = social)
fit.noint
##
## Call:
## lm(formula = primary2008 ~ -1 + messages, data = social)
##
## Coefficients:
## messagesCivic Duty messagesControl messagesHawthorne
## 0.3145 0.2966 0.3224
## 0.3779
## estimated average effect of `Neighbors' condition
coef(fit)["messagesNeighbors"] - coef(fit)["messagesControl"]
## 0.08130991
## difference in means
mean(social$primary2008[social$messages == "Neighbors"]) -
mean(social$primary2008[social$messages == "Control"])
## [1] 0.08130991
## adjusted Rsquare
adjR2 <- function(fit) {
resid <- resid(fit) # residuals
y <- fitted(fit) + resid # outcome
n <- length(y)
TSS.adj <- sum((y - mean(y))^2) / (n - 1)
SSR.adj <- sum(resid^2) / (n - length(coef(fit)))
R2.adj <- 1 - SSR.adj / TSS.adj
return(R2.adj)
}
adjR2(fit)
## [1] 0.003272788
R2(fit) # unadjusted Rsquare calculation
## [1] 0.003282564
summary(fit)$adj.r.squared
## [1] 0.003272788
17

Section 4.3.3: Heterogenous Treatment Effects
## average treatment effect (ate) among those who voted in 2004 primary
social.voter <- subset(social, primary2004 == 1)
ate.voter <-
mean(social.voter$primary2008[social.voter$messages == "Neighbors"]) -
mean(social.voter$primary2008[social.voter$messages == "Control"])
ate.voter
## [1] 0.09652525
## average effect among those who did not vote
social.nonvoter <- subset(social, primary2004 == 0)
ate.nonvoter <-
mean(social.nonvoter$primary2008[social.nonvoter$messages == "Neighbors"]) -
mean(social.nonvoter$primary2008[social.nonvoter$messages == "Control"])
ate.nonvoter
## [1] 0.06929617
## difference
ate.voter - ate.nonvoter
## [1] 0.02722908
## subset neighbors and control groups
social.neighbor <- subset(social, (messages == "Control") |
(messages == "Neighbors"))
## standard way to generate main and interaction effects
fit.int <- lm(primary2008 ~ primary2004 + messages + primary2004:messages,
data = social.neighbor)
fit.int
##
## Call:
## lm(formula = primary2008 ~ primary2004 + messages + primary2004:messages,
## data = social.neighbor)
##
## Coefficients:
## (Intercept) primary2004
## 0.23711 0.14870
## messagesNeighbors primary2004:messagesNeighbors
## 0.06930 0.02723
## lm(primary2008 ~ primary2004 * messages, data = social.neighbor)
social.neighbor$age <- 2008 - social.neighbor$yearofbirth
summary(social.neighbor$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.00 43.00 52.00 51.82 61.00 108.00
fit.age <- lm(primary2008 ~ age * messages, data = social.neighbor)
fit.age
18

##
## Call:
## lm(formula = primary2008 ~ age * messages, data = social.neighbor)
##
## Coefficients:
## (Intercept) age messagesNeighbors
## 0.0894768 0.0039982 0.0485728
## age:messagesNeighbors
## 0.0006283
## age = 25, 45, 65, 85 in Neighbors group
age.neighbor <- data.frame(age = seq(from = 25, to = 85, by = 20),
messages = "Neighbors")
## age = 25, 45, 65, 85 in Control group
age.control <- data.frame(age = seq(from = 25, to = 85, by = 20),
messages = "Control")
## average treatment effect for age = 25, 45, 65, 85
ate.age <- predict(fit.age, newdata = age.neighbor) -
predict(fit.age, newdata = age.control)
ate.age
## 1 2 3 4
## 0.06428051 0.07684667 0.08941283 0.10197899
fit.age2 <- lm(primary2008 ~ age + I(age^2) + messages + age:messages +
I(age^2):messages, data = social.neighbor)
fit.age2
##
## Call:
## lm(formula = primary2008 ~ age + I(age^2) + messages + age:messages +
## I(age^2):messages, data = social.neighbor)
##
## Coefficients:
## (Intercept) age
## -9.700e-02 1.172e-02
## I(age^2) messagesNeighbors
## -7.389e-05 -5.275e-02
## age:messagesNeighbors I(age^2):messagesNeighbors
## 4.804e-03 -3.961e-05
## predicted turnout rate under the ``Neighbors'' treatment condition
yT.hat <- predict(fit.age2,
newdata = data.frame(age = 25:85, messages = "Neighbors"))
## predicted turnout rate under the control condition
yC.hat <- predict(fit.age2,
newdata = data.frame(age = 25:85, messages = "Control"))
## plotting the predicted turnout rate under each condition
plot(x = 25:85, y = yT.hat, type = "l", xlim = c(20, 90), ylim = c(0, 0.5),
xlab = "Age", ylab = "Predicted turnout rate")
19

lines(x = 25:85, y = yC.hat, lty = "dashed")
text(40, 0.45, "Neighbors condition")
text(45, 0.15, "Control condition")
20 30 40 50 60 70 80 90
0.0
0.1
0.2
0.3
0.4
0.5
Age
Predicted
turnout
rate
Neighbors condition
Control condition
## plotting the average treatment effect as a function of age
plot(x = 25:85, y = yT.hat - yC.hat, type = "l", xlim = c(20, 90),
ylim = c(0, 0.1), xlab = "Age",
ylab = "Estimated average treatment effect")
20 30 40 50 60 70 80 90
0.00
0.02
0.04
0.06
0.08
0.10
Age
Estimated
average
treatment
effect
20

Section 4.3.4: Regression Discontinuity Design
## load the data and subset them into two parties
MPs <- read.csv("MPs.csv")
MPs.labour <- subset(MPs, subset = (party == "labour"))
MPs.tory <- subset(MPs, subset = (party == "tory"))
## two regressions for Labour: negative and positive margin
labour.fit1 <- lm(ln.net ~ margin,
data = MPs.labour[MPs.labour$margin < 0, ])
labour.fit2 <- lm(ln.net ~ margin,
data = MPs.labour[MPs.labour$margin > 0, ])
## two regressions for Tory: negative and positive margin
tory.fit1 <- lm(ln.net ~ margin, data = MPs.tory[MPs.tory$margin < 0, ])
tory.fit2 <- lm(ln.net ~ margin, data = MPs.tory[MPs.tory$margin > 0, ])
## Labour: range of predictions
y1l.range <- c(min(MPs.labour$margin), 0) # min to 0
y2l.range <- c(0, max(MPs.labour$margin)) # 0 to max
## prediction
y1.labour <- predict(labour.fit1, newdata = data.frame(margin = y1l.range))
y2.labour <- predict(labour.fit2, newdata = data.frame(margin = y2l.range))
## Tory: range of predictions
y1t.range <- c(min(MPs.tory$margin), 0) # min to 0
y2t.range <- c(0, max(MPs.tory$margin)) # 0 to max
## predict outcome
y1.tory <- predict(tory.fit1, newdata = data.frame(margin = y1t.range))
y2.tory <- predict(tory.fit2, newdata = data.frame(margin = y2t.range))
## scatterplot with regression lines for labour
plot(MPs.labour$margin, MPs.labour$ln.net, main = "Labour",
xlim = c(-0.5, 0.5), ylim = c(6, 18), xlab = "Margin of victory",
ylab = "log net wealth at death")
## add regression lines
lines(y1l.range, y1.labour, col = "red")
lines(y2l.range, y2.labour, col = "red")
21

−0.4 −0.2 0.0 0.2 0.4
6
8
10
12
14
16
18
Labour
Margin of victory
log
net
wealth
at
death
## scatterplot with regression lines for tory
plot(MPs.tory$margin, MPs.tory$ln.net, main = "Tory", xlim = c(-0.5, 0.5),
ylim = c(6, 18), xlab = "Margin of victory",
ylab = "log net wealth at death")
## add regression lines
lines(y1t.range, y1.tory, col = "red")
lines(y2t.range, y2.tory, col = "red")
22

−0.4 −0.2 0.0 0.2 0.4
6
8
10
12
14
16
18
Tory
Margin of victory
log
net
wealth
at
death
## average net wealth for Tory MP
tory.MP <- exp(y2.tory[1])
tory.MP
## 1
## 533813.5
## average net wealth for Tory non-MP
tory.nonMP <- exp(y1.tory[2])
tory.nonMP
## 2
## 278762.5
## causal effect in pounds
tory.MP - tory.nonMP
## 1
## 255050.9
## two regressions for Tory: negative and positive margin
tory.fit3 <- lm(margin.pre ~ margin, data = MPs.tory[MPs.tory$margin < 0, ])
tory.fit4 <- lm(margin.pre ~ margin, data = MPs.tory[MPs.tory$margin > 0, ])
## the difference between two intercepts is the estimated effect
coef(tory.fit4)[1] - coef(tory.fit3)[1]
## (Intercept)
## -0.01725578
23

�ݺ�ߣ

Prediction

More Related Content

Prediction