This document summarizes key concepts in predictive modeling and linear regression from the book "Code for QSS Chapter 4: Prediction". It includes examples of using loops and conditional statements in R to make predictions, perform linear regression on facial competence scores and election outcomes, examine regression towards the mean, and merge multiple datasets. Poll data from 2008 is used to predict election results and compare to actual outcomes, finding an average error of 1.06 percentage points.
1 of 23
Download to read offline
More Related Content
Prediction
1. Code for QSS Chapter 4: Prediction
Kosuke Imai
First Printing
Section 4.1: Predicting Election Outcomes
Section 4.1.1: Loops in R
values <- c(2, 4, 6)
n <- length(values) # number of elements in `values'
results <- rep(NA, n) # empty container vector for storing the results
## loop counter `i' will take values on 1, 2, ..., n in that order
for (i in 1:n) {
## store the result of multiplication as the ith element of
## `results' vector
results[i] <- values[i] * 2
cat(values[i], "times 2 is equal to", results[i], "n")
}
## 2 times 2 is equal to 4
## 4 times 2 is equal to 8
## 6 times 2 is equal to 12
results
## [1] 4 8 12
## check if the code runs when i = 1
i <- 1
x <- values[i] * 2
cat(values[i], "times 2 is equal to", x, "n")
## 2 times 2 is equal to 4
Section 4.1.2: General Conditional Statements in R
## define the operation to be executed
operation <- "add"
if (operation == "add") {
cat("I will perform addition 4 + 4n")
4 + 4
}
## I will perform addition 4 + 4
## [1] 8
if (operation == "multiply") {
cat("I will perform multiplication 4 * 4n")
4 * 4
}
1
2. ## Note that `operation' is redefined
operation <- "multiply"
if (operation == "add") {
cat("I will perform addition 4 + 4")
4 + 4
} else {
cat("I will perform multiplication 4 * 4")
4 * 4
}
## I will perform multiplication 4 * 4
## [1] 16
## Note that `operation' is redefined
operation <- "subtract"
if (operation == "add") {
cat("I will perform addition 4 + 4n")
4 + 4
} else if (operation == "multiply") {
cat("I will perform multiplication 4 * 4n")
4 * 4
} else {
cat("`", operation, "' is invalid. Use either `add' or `multiply'.n",
sep = "")
}
## `subtract' is invalid. Use either `add' or `multiply'.
values <- 1:5
n <- length(values)
results <- rep(NA, n)
for (i in 1:n) {
## x and r get overwritten in each iteration
x <- values[i]
r <- x %% 2 # remainder when divided by 2 to check whether even or odd
if (r == 0) { # remainder is zero
cat(x, "is even and I will perform addition",
x, "+", x, "n")
results[i] <- x + x
} else { # remainder is not zero
cat(x, "is odd and I will perform multiplication",
x, "*", x, "n")
results[i] <- x * x
}
}
## 1 is odd and I will perform multiplication 1 * 1
## 2 is even and I will perform addition 2 + 2
## 3 is odd and I will perform multiplication 3 * 3
## 4 is even and I will perform addition 4 + 4
## 5 is odd and I will perform multiplication 5 * 5
results
## [1] 1 4 9 8 25
2
3. Section 4.1.3: Poll Predictions
## load election results, by state
pres08 <- read.csv("pres08.csv")
## load polling data
polls08 <- read.csv("polls08.csv")
## compute Obama's margin
polls08$margin <- polls08$Obama - polls08$McCain
pres08$margin <- pres08$Obama - pres08$McCain
x <- as.Date("2008-11-04")
y <- as.Date("2008/9/1")
x - y # number of days between 2008/9/1 and 11/4
## Time difference of 64 days
## convert to a Date object
polls08$middate <- as.Date(polls08$middate)
## computer the number of days to the election day
polls08$DaysToElection <- as.Date("2008-11-04") - polls08$middate
poll.pred <- rep(NA, 51) # initialize a vector place holder
## extract unique state names which the loop will iterate through
st.names <- unique(polls08$state)
## add state names as labels for easy interpretation later on
names(poll.pred) <- as.character(st.names)
## loop across 50 states plus DC
for (i in 1:51){
## subset the ith state
state.data <- subset(polls08, subset = (state == st.names[i]))
## further subset the latest polls within the state
latest <- subset(state.data, DaysToElection == min(DaysToElection))
## compute the mean of latest polls and store it
poll.pred[i] <- mean(latest$margin)
}
## error of latest polls
errors <- pres08$margin - poll.pred
names(errors) <- st.names # add state names
mean(errors) # mean prediction error
## [1] 1.062092
sqrt(mean(errors^2))
## [1] 5.90894
## histogram
hist(errors, freq = FALSE, ylim = c(0, 0.08),
main = "Poll prediction error",
xlab = "Error in predicted margin for Obama (percentage points)")
## add mean
abline(v = mean(errors), lty = "dashed", col = "red")
3
4. text(x = -7, y = 0.07, "average error", col = "red")
Poll prediction error
Error in predicted margin for Obama (percentage points)
Density
−20 −10 0 10 20
0.00
0.02
0.04
0.06
0.08
average error
## type = "n" generates "empty" plot
plot(poll.pred, pres08$margin, type = "n", main = "", xlab = "Poll results",
xlim = c(-40, 90), ylim = c(-40, 90), ylab = "Actual election results")
## add state abbreviations
text(x = poll.pred, y = pres08$margin, labels = pres08$state, col = "blue")
## lines
abline(a = 0, b = 1, lty = "dashed") # 45 degree line
abline(v = 0) # vertical line at 0
abline(h = 0) # horizontal line at 0
4
5. −40 −20 0 20 40 60 80
−40
0
20
40
60
80
Poll results
Actual
election
results
ALAK
AZ
AR
CA
CO
CT
DC
DE
FL
GA
HI
ID
IL
IN
IA
KS
KY LA
ME
MD
MA
MI
MN
MS
MO
MT
NE
NV
NH
NJ
NM
NY
NC
ND
OH
OK
OR
PA
RI
SC
SD
TNTX
UT
VT
VA
WA
WV
WI
WY
## which state polls called wrong?
pres08$state[sign(poll.pred) != sign(pres08$margin)]
## [1] IN MO NC
## 51 Levels: AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA ... WY
## what was the actual margin for these states?
pres08$margin[sign(poll.pred) != sign(pres08$margin)]
## [1] 1 -1 1
## actual results: total number of electoral votes won by Obama
sum(pres08$EV[pres08$margin > 0])
## [1] 364
## poll prediction
sum(pres08$EV[poll.pred > 0])
## [1] 349
## load the data
pollsUS08 <- read.csv("pollsUS08.csv")
## compute number of days to the election as before
pollsUS08$middate <- as.Date(pollsUS08$middate)
pollsUS08$DaysToElection <- as.Date("2008-11-04") - pollsUS08$middate
## empty vectors to store predictions
Obama.pred <- McCain.pred <- rep(NA, 90)
for (i in 1:90) {
## take all polls conducted within the past 7 days
week.data <- subset(pollsUS08, subset = ((DaysToElection <= (90 - i + 7))
& (DaysToElection > (90 - i))))
5
6. ## compute support for each candidate using the average
Obama.pred[i] <- mean(week.data$Obama)
McCain.pred[i] <- mean(week.data$McCain)
}
## plot going from 90 days to 1 day before the election
plot(90:1, Obama.pred, type = "b", xlim = c(90, 0), ylim = c(40, 60),
col = "blue", xlab = "Days to the election",
ylab = "Support for candidate (percentage points)")
## `type = "b"' gives plot that includes both points and lines
lines(90:1, McCain.pred, type = "b", col = "red")
## actual election results: pch = 19 gives solid circles
points(0, 52.93, pch = 19, col = "blue")
points(0, 45.65, pch = 19, col = "red")
## line indicating the election day
abline(v = 0)
## labeling candidates
text(80, 48, "Obama", col = "blue")
text(80, 41, "McCain", col = "red")
80 60 40 20 0
40
45
50
55
60
Days to the election
Support
for
candidate
(percentage
points)
Obama
McCain
Section 4.2: Linear Regression
Section 4.2.1: Facial Appearance and Election Outcomes
6
7. ## load the data
face <- read.csv("face.csv")
## two-party vote share for Democrats and Republicans
face$d.share <- face$d.votes / (face$d.votes + face$r.votes)
face$r.share <- face$r.votes / (face$d.votes + face$r.votes)
face$diff.share <- face$d.share - face$r.share
plot(face$d.comp, face$diff.share, pch = 16,
col = ifelse(face$w.party == "R", "red", "blue"),
xlim = c(0, 1), ylim = c(-1, 1),
xlab = "Competence scores for Democrats",
ylab = "Democratic margin in vote share",
main = "Facial competence and vote share")
0.0 0.2 0.4 0.6 0.8 1.0
−1.0
−0.5
0.0
0.5
1.0
Facial competence and vote share
Competence scores for Democrats
Democratic
margin
in
vote
share
Section 4.2.2: Correlation and Scatter Plots
cor(face$d.comp, face$diff.share)
## [1] 0.4327743
Section 4.2.3: Least Squares
fit <- lm(diff.share ~ d.comp, data = face) # fit the model
fit
##
## Call:
7