ºÝºÝߣ

ºÝºÝߣShare a Scribd company logo
다중회귀분석 Multiple Regression Analysis
“미국의 집값에 영향을 미치는 요인들에 대한 분석”
House Pricing
1. 독립변수에 대한 설명
(1) 자료에 대한 요약
i. 모든 변수들 간의 상관관계
# Convert the date variable to numeric ----
# Keep only the leading "YYYYMMDD" characters of each date (e.g. "20141124").
date_numeric <- substr(kc_house$date, 1, 8)
is.numeric(date_numeric) # FALSE
date_numeric <- as.numeric(date_numeric)
is.numeric(date_numeric) # TRUE
kc_house$date <- date_numeric # replace the original date with the numeric one
View(kc_house)

# Pairwise correlations between all variables.
house_cor <- cor(kc_house)

library(psych)
house_cortest <- psych::corr.test(kc_house)
# corr.test() returns a list (r, n, t, p, ...), which DT::datatable() cannot
# render; show the correlation matrix component (p-values are in `$p`).
DT::datatable(house_cortest$r)
View(house_cortest)
library(Hmisc)
# Run on kc_house like the rest of this section; the earlier call used the
# unrelated built-in `attitude` data set.
Hmisc::rcorr(as.matrix(kc_house))
round(house_cor, 2) # round to two decimal places

# Scatter-plot matrix.
# NOTE(review): this plots the correlation matrix itself, not the raw
# observations - confirm that is intended.
pairs(house_cor,
  pch = 19,
  bg = c("red", "green", "blue")
)

# Correlation plot: the larger the coefficient, the bigger and darker the
# symbol; positive values are blue, negative values are red.
corrplot::corrplot(house_cor)
corrplot::corrplot(house_cor, method = "number") # numbers plus colour
col <- colorRampPalette(c(
  "#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"
))
corrplot::corrplot(house_cor,
  method = "color", # coloured tiles
  col = col(200), # 200 colour steps
  type = "lower", # lower-left triangle only
  order = "hclust", # cluster similar coefficients together
  addCoef.col = "black", # colour of the printed coefficients
  tl.col = "black", # colour of the variable names
  tl.srt = 45, # rotate the variable names by 45 degrees
  diag = FALSE # leave out the diagonal
)
i-1. ìƒê´€ê³„수를 세 그룹으로 나누어 표현
Plot1. price, bedrooms, bathrooms, sqft_living and sqft lot
House pricing prediction in R(Regression Project)
Plot2. price, floors, waterfront, view, condition and grade
Plot 3. price, yr built, lat and long
< ê²°ê³¼ >
# ìƒê´€ê´€ê³„ê°€ ë†’ì€ ë³€ìˆ˜ë“¤ ( > 0.5)
# sqft_lot15 - sqft_lot : 0.72 # 둘다 삭제
# bathrooms - floors : 0.50 # 변환(모ë¸ë§)
# bathrooms - yr_built : 0.51 # bathrooms 채íƒ
# bathrooms - bedrooms : 0.52 # 변환(모ë¸ë§)
# price - bathrooms : 0.53
# sqft_living15 - bathrooms : 0.57 # bathrooms 채íƒ
# sqft_living15 - price : 0.59
# grade - bathrooms : 0.66 # bathrooms 변환
# grade - price : 0.67
# grade - sqft_living15 : 0.71 # grade 채íƒ
# sqft_living - bedrooms : 0.58 # 변환(모ë¸ë§)
# sqft_living - bathrooms : 0.75 # 변환(모ë¸ë§)
# sqft_living - price : 0.70
# sqft_living - sqft_living15 : 0.76 # sqft_living 채태
# sqft_living - grade : 0.76 # sqft_living 변환
# sqft_above - floors : 0.52 # floors 채íƒ
# sqft_above - bathrooms : 0.69 # bathrooms 채íƒ
# sqft_above - price : 0.61
# sqft_above - sqft_living15 : 0.73 # 둘다 삭제
# sqft_above - grade : 0.76 # grade 채íƒ
# sqft_above - sqft_living : 0.88 # sqft_living 채íƒ
# zipcode - long : -0.56 # zipcode 채íƒ
채íƒë³€ìˆ˜ : bedrooms, bathrooms, sqft_living, floors, view, grade
ii. price ìƒìœ„ 25% ì§‘ì„ ì§€ë„ì— í‘œì‹œ
# Overlay the houses in `high_25_loc` on the base map as red points.
# `house_map` and `high_25_loc` are created elsewhere in the analysis.
house_map +
  ggplot2::geom_point(
    mapping = aes(x = long, y = lat),
    data = high_25_loc,
    colour = "red"
  )
iii. zipcode 별로 ì§‘ì˜ ìœ„ì¹˜ì™€ ë°€ë„를 지ë„ì— í‘œì‹œ
# Group houses into zipcode ranges and draw each range in its own colour ----

# Coordinates (long, lat) of every house whose zipcode is in [first, last].
zipcode_range_loc <- function(first, last) {
  in_range <- (kc_house$zipcode >= first) & (kc_house$zipcode <= last)
  kc_house[in_range, c("long", "lat")]
}

zipcode_1_10_loc <- zipcode_range_loc(98001, 98010)
zipcode_11_20_loc <- zipcode_range_loc(98011, 98020)
zipcode_21_30_loc <- zipcode_range_loc(98021, 98030)
zipcode_31_40_loc <- zipcode_range_loc(98031, 98040)
zipcode_41_50_loc <- zipcode_range_loc(98041, 98050)

# Base map centred on lon -122.1 / lat 47.5.
house_map <- ggmap(
  get_googlemap(center = c(lon = -122.1, lat = 47.5), zoom = 10)
)

# One point layer per zipcode range.
zipcode_layer <- function(loc, colour) {
  ggplot2::geom_point(data = loc, aes(x = long, y = lat), colour = colour)
}

# Houses of each zipcode group drawn on the map with ggmap and ggplot2.
house_map +
  zipcode_layer(zipcode_1_10_loc, "red") +
  zipcode_layer(zipcode_11_20_loc, "orange") +
  zipcode_layer(zipcode_21_30_loc, "yellow") +
  zipcode_layer(zipcode_31_40_loc, "green") +
  zipcode_layer(zipcode_41_50_loc, "blue")
# Group houses by the last digit of their zipcode and map each group ----
# grepl() returns one TRUE/FALSE per row, so it is a valid row mask.
# Comparing the column with `grep(..., value = TRUE)` via `==` does not work:
# that result is shorter than the column and is recycled position by position.
zipcode__1_loc <- kc_house[grepl("1$", kc_house$zipcode), c("long", "lat")]
zipcode__2_loc <- kc_house[grepl("2$", kc_house$zipcode), c("long", "lat")]
zipcode__3_loc <- kc_house[grepl("3$", kc_house$zipcode), c("long", "lat")]
house_map +
  ggplot2::geom_point(
    data = zipcode__1_loc,
    aes(x = long, y = lat),
    colour = "red"
  ) +
  ggplot2::geom_point(
    data = zipcode__2_loc,
    aes(x = long, y = lat),
    colour = "orange"
  ) +
  ggplot2::geom_point(
    data = zipcode__3_loc,
    aes(x = long, y = lat),
    colour = "yellow"
  )
(2) 채íƒí•œ ë…립변수
- room_newnum : ì§‘ì˜ í¬ê¸°ì— ì˜í–¥ì„ 미치는 변수들(bathrooms, bedrooms, floors, sqft_living)
ì„ í•˜ë‚˜ì˜ ë³€ìˆ˜ë¡œ 모ë¸ë§
- waterfront : 해안가(waterfront) 여부
- view : ì§‘ì„ ë³´ëŸ¬ì˜¨ 횟수
- grade : ì§‘ì— ëŒ€í•œ ì „ë°˜ì ì¸ í‰ê°€ì²™ë„
- X1 ~ X12 : price가 비슷한 zipcode 별 group화
i. room_newnum : ì§‘ì˜ í¬ê¸°ì— ì˜í–¥ì„ 미치는 변수들(bathrooms, bedrooms, floors,
sqft_living)ì„ í•˜ë‚˜ì˜ ë³€ìˆ˜ë¡œ 모ë¸ë§
- 분ì„방법 : ìƒê´€ë¶„ì„
# Model bedrooms, bathrooms, floors (and living area) as one variable ----
# Correlation of each candidate with price.
with(kc_house, cor(bedrooms, price)) # 0.308
with(kc_house, cor(bathrooms, price)) # 0.525
with(kc_house, cor(floors, price)) # 0.257
with(kc_house, cor(sqft_living, price)) # 0.702
with(kc_house, cor(sqft_lot, price)) # 0.090 # excluded!
# Weighted sum of the four kept variables, each weighted by its correlation.
kc_house_data$room_newnum <- with(
  kc_house,
  bedrooms * 0.308 + bathrooms * 0.525 + floors * 0.257 + sqft_living * 0.702
)
<ë° ì´í„° 모ë¸ë§ ì‹>
# Size score: each size-related variable weighted by its correlation with
# price. Every `+` must END its line: a line that starts with `+` is parsed as
# a separate statement, which would leave only bedrooms * 0.308 in the column.
kc_house_data$room_newnum <- kc_house$bedrooms * 0.308 +
  kc_house$bathrooms * 0.525 +
  kc_house$floors * 0.257 +
  kc_house$sqft_living * 0.702
# Read the score back from the data frame it was written to (kc_house_data).
cor(kc_house_data$room_newnum, kc_house$price) # 0.702
ii. waterfront : 해안가(waterfront) 여부
- 분ì„방법 : 양측 가설검정 (t-test)
# 해안가(=1) group ì˜ ìœ„ì¹˜
# Coordinates of waterfront houses (waterfront == 1).
on_waterfront <- kc_house$waterfront == 1
waterfront_T <- kc_house[on_waterfront, c("long", "lat")]
# Coordinates of houses that are not on the waterfront (waterfront == 0).
off_waterfront <- kc_house$waterfront == 0
waterfront_F <- kc_house[off_waterfront, c("long", "lat")]
# Base map, then waterfront houses in red and the rest in orange.
house_map <- ggmap(
  get_googlemap(center = c(lon = -122.1, lat = 47.5), zoom = 10)
)
waterfront_layer <- ggplot2::geom_point(
  data = waterfront_T,
  aes(x = long, y = lat),
  colour = "red"
)
inland_layer <- ggplot2::geom_point(
  data = waterfront_F,
  aes(x = long, y = lat),
  colour = "orange"
)
house_map + waterfront_layer + inland_layer
# 해안가(=1) group ì˜ price
# Prices of waterfront houses (waterfront == 1).
# drop = FALSE keeps a one-column table whether kc_house is a tibble or a base
# data frame, so `[[1]]` below always yields the whole price vector rather
# than a single price.
waterfront_T_price <- kc_house[kc_house$waterfront == 1, "price", drop = FALSE]
# Prices of houses that are not on the waterfront (waterfront == 0).
waterfront_F_price <- kc_house[kc_house$waterfront == 0, "price", drop = FALSE]
summary(waterfront_T_price[[1]])
summary(waterfront_F_price[[1]])
# Two-sided Welch t-test on the two group means, plus a visual comparison.
t.test(waterfront_T_price[[1]], waterfront_F_price[[1]])
boxplot(waterfront_T_price[[1]], waterfront_F_price[[1]])
ê²°ê³¼
Welch Two Sample t-test
data: waterfront_T_price[[1]] and waterfront_F_price[[1]]
t = 12.876, df = 162.23, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
956963.3 1303661.6
sample estimates:
mean of x mean of y
1661876.0 531563.6
귀무가설 : 두 그룹의 평균은 같다
대립가설 : 두 그룹의 평균은 같지 않다
p-value < 2.2e-16 이므로, 유의수준 α=0.05에서 귀무가설을 기각한다. 즉, 해안가에 위치한 집들
의 평균 집값과 해안가에 위치하지 않은 집들의 평균 집값이 다르다고 볼 수 있다.
iii. view : ì§‘ì„ ë³´ëŸ¬ì˜¨ 횟수
- 분ì„방법 : 여러 집단 ê°„ì˜ ì°¨ì´ê²€ì •(ANOVA, scheffe.test)
# Price by `view` score (0-4): one-way ANOVA with post-hoc tests ----
# drop = FALSE keeps a one-column table for tibbles and base data frames
# alike, so each `[[1]]` below returns the full price vector of the group.
view0_price <- kc_house[kc_house$view == 0, "price", drop = FALSE]
length(view0_price[[1]]) # 19489
view1_price <- kc_house[kc_house$view == 1, "price", drop = FALSE]
length(view1_price[[1]]) # 332
view2_price <- kc_house[kc_house$view == 2, "price", drop = FALSE]
length(view2_price[[1]]) # 963
view3_price <- kc_house[kc_house$view == 3, "price", drop = FALSE]
length(view3_price[[1]]) # 510
view4_price <- kc_house[kc_house$view == 4, "price", drop = FALSE]
length(view4_price[[1]]) # 319
mean(view0_price[[1]]) # 496564.2
mean(view1_price[[1]]) # 812280.8
mean(view2_price[[1]]) # 792400.9
mean(view3_price[[1]]) # 971965.3
mean(view4_price[[1]]) # 1463711
view_price <- c(
  view0_price[[1]], view1_price[[1]], view2_price[[1]],
  view3_price[[1]], view4_price[[1]]
)
# Group label per price, sized from the data instead of hard-coded counts.
# It must be a factor: with a numeric label aov() fits a single slope
# (1 degree of freedom) rather than comparing the five group means.
group <- factor(rep(
  0:4,
  times = c(
    length(view0_price[[1]]), length(view1_price[[1]]),
    length(view2_price[[1]]), length(view3_price[[1]]),
    length(view4_price[[1]])
  )
))
length(view_price) # 21613
length(group) # 21613
cbind(view_price, group = as.integer(as.character(group)))
boxplot(view_price ~ group, ylab = "Price", xlab = "View")
# Descriptive statistics per group (describeBy replaces describe.by).
psych::describeBy(view_price, group)
ANO_R <- aov(view_price ~ group)
anova(ANO_R)
library(agricolae)
scheffe.test(ANO_R, "group", alpha = 0.05, console = TRUE)
LSD.test(ANO_R, "group", alpha = 0.05, console = TRUE)
duncan.test(ANO_R, "group", alpha = 0.05, console = TRUE)
ê²°ê³¼
> head( cbind(view_price, group) )
view_price group
[1,] 221900 0
[2,] 538000 0
[3,] 180000 0
[4,] 604000 0
[5,] 510000 0
[6,] 1225000 0
> tail( cbind(view_price, group) )
view_price group
[21608,] 580000 4
[21609,] 2300000 4
[21610,] 1149000 4
[21611,] 900000 4
[21612,] 2230000 4
[21613,] 3567000 4
> describe.by(view_price, group) # 그룹별 기술통계량 계산 # mad
Descriptive statistics by group
group: 0
vars n mean sd median trimmed mad min max range skew
kurtosis se
X1 1 19489 496564.2 287133.3 432500 456422 203857.5 75000 5570000 5495000 3.11
21.58 2056.78
------------------------------------------------------------------------------
group: 1
vars n mean sd median trimmed mad min max range skew
kurtosis se
X1 1 332 812280.8 510949.7 690944 722550.6 308714.4 217000 3650000 3433000 2.26
6.65 28042.01
------------------------------------------------------------------------------
group: 2
vars n mean sd median trimmed mad min max range skew kurtosis
se
X1 1 963 792400.9 510105 675000 714267.2 318759 169317 7062500 6893183 3.57 27.65
16437.91
------------------------------------------------------------------------------
group: 3
vars n mean sd median trimmed mad min max range skew
kurtosis se
X1 1 510 971965.3 612692.2 802500 892245.3 450710.4 154000 7700000 7546000 3.38
28.12 27130.47
------------------------------------------------------------------------------
group: 4
vars n mean sd median trimmed mad min max range skew
kurtosis se
X1 1 319 1463711 952209.6 1185000 1320949 667170 252000 6885000 6633000 1.84
4.87 53313.5
> scheffe.test(ANO_R, "group", alpha = 0.05, console = TRUE)
Study: ANO_R ~ "group"
Scheffe Test for view_price
Mean Square Error : 113513294940
group, means
view_price std r Min Max
0 496564.2 287133.3 19489 75000 5570000
1 812280.8 510949.7 332 217000 3650000
2 792400.9 510105.1 963 169317 7062500
3 971965.3 612692.2 510 154000 7700000
4 1463711.2 952209.6 319 252000 6885000
Alpha: 0.05 ; DF Error: 21611
Critical Value of F: 2.372343
Harmonic Mean of Cell Sizes 543.6342
Minimum Significant Difference: 62951.16
Means with the same letter are not significantly different.
Groups, Treatments and means
a 4 1464000
b 3 972000
c 1 812300
c 2 792400
d 0 496600
ì§‘ì„ ë³´ëŸ¬ì˜¨ 횟수가 1ë²ˆì¸ ê²½ìš°ì™€ 2ë²ˆì¸ ê²½ìš°ì˜ í‰ê· ì€ 유ì˜ë¯¸í•œ ì°¨ì´ê°€ 발견ë˜ì§€ ì•Šì•„, 비슷
í•œ 그룹ì´ë¼ê³  ë´ë„ 무방하다. ì§‘ì„ ë³´ëŸ¬ì˜¨ 횟수가 많ì„ìˆ˜ë¡ ì§‘ì˜ í‰ê· ê°’ì´ ë†’ì•„ì§€ëŠ” 경향ì´
있다.
iv. grade : ì§‘ì— ëŒ€í•œ ì „ë°˜ì ì¸ í‰ê°€ì²™ë„
- 분ì„방법 : ìƒê´€ê´€ê³„ 분ì„
> cor.test(kc_house$grade, kc_house$price)
Pearson's product-moment correlation
data: kc_house$grade and kc_house$price
t = 131.76, df = 21611, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.6599749 0.6747621
sample estimates:
cor
0.6674343
p-value < 2.2e-16 ì´ë¯€ë¡œ ìƒê´€ê´€ê³„ 분ì„ì— ëŒ€í•œ 결과가 유ì˜ë¯¸í•˜ë‹¤. 즉, grade변수와 price ë³€
수 사ì´ì— ìƒê´€ê³„수는 0.667 ì´ë©°, 비êµì  ê°•í•œ ì–‘ì˜ ìƒê´€ê´€ê³„ê°€ 있다고 ë³¼ 수 있다.
# Scatter plot of price (y axis) against grade (x axis).
plot(
  x = kc_house$grade,
  y = kc_house$price,
  xlab = "Grade",
  ylab = "Price"
)
v. X1 ~ X12 : price가 비슷한 zipcode 별 group화
# zipcode 별 price ì˜í‰ê· ì„ group í™”
# Mean price per zipcode, then 12 price-ranked zipcode groups as dummies ----
kc_house_DT <- as.data.table(kc_house)
zipcode_price_group <- kc_house_DT[
  ,
  list(n = .N, Mean = mean(price)),
  by = list(zipcode, school_el, school_mi, school_hi, school_to)
]
head(zipcode_price_group)
# Zipcodes from most to least expensive. order() yields row positions; sort()
# would return the mean prices themselves, which are not row indices.
zipcode_price_group[order(Mean, decreasing = TRUE), c("zipcode", "Mean")]
zipcode_price_group_DT <- as.data.table(zipcode_price_group)
zipcode_price_group_DT <- zipcode_price_group_DT[
  order(Mean, decreasing = TRUE),
]
summary(zipcode_price_group_DT)
# Mean
# Min. : 234284
# 1st Qu.: 354126
# Median : 491952
# Mean : 560774
# 3rd Qu.: 645438
# Max. : 2160607

# Rows of the price-ranked table that make up each of the 12 groups
# (group 1 = most expensive zipcode).
zipcode_group_rows <- list(
  1, 2, 3, 4, 5, 6:11, 12:13, 14:25, 26:34, 35:48, 49:61, 62:70
)
zipcode_groups <- lapply(
  zipcode_group_rows,
  function(rows) zipcode_price_group_DT$zipcode[rows]
)

# One 0/1 dummy column per group; data.frame(matrix()) names them X1..X12.
# %in% tests membership in a group of several zipcodes, which `==` cannot do.
# NOTE(review): if the ranked table has exactly 70 rows the 12 dummies cover
# every house, so one of them is redundant next to a model intercept.
dummies <- data.frame(
  matrix(0, nrow = nrow(kc_house), ncol = length(zipcode_groups))
)
for (i in seq_along(zipcode_groups)) {
  dummies[, i] <- ifelse(kc_house$zipcode %in% zipcode_groups[[i]], 1, 0)
}
kc_house_data <- cbind(kc_house, dummies)
ê²°ê³¼
> head(zipcode_price_group)
zipcode school_el school_mi school_hi school_to n Mean
1: 98001 16 4 6 26 362 280804.7
2: 98002 7 4 3 14 199 234284.0
3: 98003 10 6 5 21 280 294111.3
4: 98004 5 3 4 12 317 1355927.1
5: 98005 5 3 5 13 168 810164.9
6: 98006 9 3 2 14 498 859684.8
(3) 채íƒí•˜ì§€ 못한 ë…립변수
- house_age : 주íƒì´ ì˜¤ëž˜ëœ ì •ë„ (2017 – yr_built)
- renovated_TF : renovateì˜ ì—¬ë¶€
- school_to : zipcode group 별 í•™êµìˆ˜
- season_price : ì§‘ì´ íŒ”ë¦° 계절별(ë´„/여름/ê°€ì„/겨울)
i. house_age : 주íƒì´ ì˜¤ëž˜ëœ ì •ë„ (2017 – yr_built)
- 분ì„방법 : ìƒê´€ê´€ê³„ 분ì„
# House age (2017 - yr_built) and its correlation with price ----
summary(kc_house$yr_built)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1900 1951 1975 1971 1997 2015
house_age <- 2017 - kc_house$yr_built
summary(house_age)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 2.00 20.00 42.00 45.99 66.00 117.00
# The observed value belongs in a comment; written as code it would be
# subtracted from the correlation.
cor(house_age, kc_house$price) # -0.054
cor.test(house_age, kc_house$price)
plot(house_age, kc_house$price)
ê²°ê³¼
Pearson's product-moment correlation
data: house_age and kc_house$price
t = -7.9517, df = 21611, p-value = 1.93e-15
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.06729506 -0.04070886
sample estimates:
cor
-0.05401153
p-valueê°€ 0.000ê°’ì´ë¯€ë¡œ 유ì˜ìˆ˜ì¤€ a=0.05ì—ì„œ ìƒê´€ê³„ìˆ˜ì˜ ê°’ì€ ìœ ì˜ë¯¸í•˜ë‹¤. ë”°ë¼ì„œ ì§‘ì´ ì§€ì–´
진 ì—°ë„와 price와 ìƒê´€ê´€ê³„ê°€ ê±°ì˜ ì—†ë‹¤ê³  ë³¼ 수 있다.
ii. renovated_TF : renovateì˜ ì—¬ë¶€
- 분ì„방법 : 양측 가설검정 (t-test)
# Hypothesis: renovated houses sell for more than non-renovated houses.
# drop = FALSE keeps a one-column table for tibbles and base data frames
# alike, so `[[1]]` below returns every price of the group.
# Prices of renovated houses (yr_renovated != 0).
renovate_house <- kc_house[kc_house$yr_renovated != 0, "price", drop = FALSE]
# Prices of houses that were never renovated (yr_renovated == 0).
not_renovate_house <- kc_house[
  kc_house$yr_renovated == 0, "price",
  drop = FALSE
]
t.test(renovate_house[[1]], not_renovate_house[[1]])
# p-value < 2.2e-16, so the two group means are not equal (H0 rejected).
# mean of x mean of y
# 760379.0 530360.8
Welch Two Sample t-test
ê²°ê³¼ >
data: renovate_house[[1]] and not_renovate_house[[1]]
t = 11.36, df = 939.86, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
190280.9 269755.5
sample estimates:
mean of x mean of y
760379.0 530360.8
p-valueê°’ì´ 0.000ì´ë¯€ë¡œ t.test ê²°ê³¼, renovateëœ ì§‘ê³¼ renovateê°€ ë˜ì§€ ì•Šì€ ì§‘ 사ì´ì˜ í‰ê· 
ê°’ì—는 ì°¨ì´ê°€ 나타났다.
하지만 회귀검정 과정ì—ì„œ ì´ ë³€ìˆ˜ë¥¼ ì œì™¸í•˜ì˜€ì„ ë•Œì˜ R-squareê°’ì´ ì´ ë³€ìˆ˜ë¥¼ ì¶”ê°€í•˜ì˜€ì„ ë•Œ
ì˜ R-square값보다 높았기 ë•Œë¬¸ì— íšŒê·€ëª¨í˜• 변수로 채íƒí•˜ì§€ 않았다.
iii. school_to : zipcode group 별 í•™êµìˆ˜
# Number of schools per zipcode and its relation to price ----
kc_house <- readxl::read_excel(
  path = "kc_house_data.xlsx",
  sheet = 1,
  col_names = TRUE
)
zipcode_school <- readxl::read_excel(
  path = "zipcode_school.xlsx",
  sheet = 1,
  col_names = TRUE
)
str(zipcode_school)
head(zipcode_school)
kc_house_DT <- as.data.table(kc_house)
# Join the original kc_house data with zipcode_school and store the result
# back in kc_house.
kc_house <- merge(kc_house, zipcode_school, by = "zipcode", all = TRUE)
View(kc_house)
# Zipcodes sorted by total school count, descending.
x <- kc_house[
  order(kc_house$school_to, decreasing = TRUE),
  c("zipcode", "school_to")
]
unique(x) # drop duplicate rows
# Zipcodes sorted by price, descending.
kc_house[order(kc_house$price, decreasing = TRUE), c("price", "zipcode")]

# Correlation of each school count with the mean price per zipcode.
# Columns are addressed by name (school_el, school_mi, school_hi, school_to,
# Mean) rather than by position, so a changed column order cannot silently
# pair the wrong variables.
cor(zipcode_price_group$school_el, zipcode_price_group$Mean) # el - price: -0.2
cor(zipcode_price_group$school_mi, zipcode_price_group$Mean) # mi - price: -0.2
cor(zipcode_price_group$school_hi, zipcode_price_group$Mean) # hi - price: -0.3
cor(zipcode_price_group$school_to, zipcode_price_group$Mean) # to - price: -0.3
# el - price: -0.2
cor.test(zipcode_price_group$school_el, zipcode_price_group$Mean)
# mi - price: -0.2
cor.test(zipcode_price_group$school_mi, zipcode_price_group$Mean)
# hi - price: -0.3
cor.test(zipcode_price_group$school_hi, zipcode_price_group$Mean)
# to - price: -0.3
cor.test(zipcode_price_group$school_to, zipcode_price_group$Mean)
í•™êµ ìˆ˜ê°€ 많ì„ìˆ˜ë¡ ì§‘ê°’ì´ ë‹¤ì†Œ 하ë½í•˜ëŠ” ê²½í–¥ì„ ë³´ì¸ë‹¤. 하지만 ìƒê´€ê´€ê³„ê°€ 0.2~0.3 으로
낮게 나왔기 ë•Œë¬¸ì— ì´ˆë“±í•™êµ, 중학êµ, 고등학êµ, ì „ì²´ í•™êµì˜ 숫ìžëŠ” price ì— í° ì˜í–¥ì„
미치지 않는다고 볼 수 있다.
iv. season_price : ì§‘ì´ íŒ”ë¦° 계절별(ë´„/여름/ê°€ì„/겨울)
- 분ì„방법 : 여러집단 사ì´ì˜ ì°¨ì´ê²€ì • (ANOVA, scheffe, test)
# Price by season of sale: one-way ANOVA with post-hoc tests ----
head(date_numeric)
# Create the new variable.
# NOTE(review): date_numeric must be in the same row order as kc_house; if
# kc_house was re-read or merged after date_numeric was built - confirm.
kc_house$date_numeric <- date_numeric
# Group prices by the month of the sale date (characters 5-6 of YYYYMMDD):
# spring (3, 4, 5) / summer (6, 7, 8) / autumn (9, 10, 11) / winter (12, 1, 2).
# drop = FALSE keeps a one-column table for tibbles and base data frames
# alike, so each `[[1]]` below returns every price of the season.
spr_price <- kc_house[
  grep("....03..|....04..|....05..", date_numeric), "price",
  drop = FALSE
]
sum_price <- kc_house[
  grep("....06..|....07..|....08..", date_numeric), "price",
  drop = FALSE
]
fal_price <- kc_house[
  grep("....09..|....10..|....11..", date_numeric), "price",
  drop = FALSE
]
win_price <- kc_house[
  grep("....12..|....01..|....02..", date_numeric), "price",
  drop = FALSE
]
mean(spr_price[[1]])
mean(sum_price[[1]])
mean(fal_price[[1]])
mean(win_price[[1]])
length(spr_price[[1]]) + length(sum_price[[1]]) + length(fal_price[[1]]) +
  length(win_price[[1]])
# 21613: check that every row was captured - OK
seson_price <- c(
  spr_price[[1]], sum_price[[1]], fal_price[[1]], win_price[[1]]
)
# Season label per price. It must be a factor: with a numeric label aov()
# fits a single slope (1 degree of freedom) instead of comparing four means.
group <- factor(rep(
  1:4,
  times = c(
    length(spr_price[[1]]), length(sum_price[[1]]),
    length(fal_price[[1]]), length(win_price[[1]])
  )
))
length(group) # 21613
cbind(seson_price, group = as.integer(as.character(group)))
boxplot(seson_price ~ group)
# Descriptive statistics per group (describeBy replaces describe.by).
psych::describeBy(seson_price, group)
ANO_R <- aov(seson_price ~ group)
anova(ANO_R)
scheffe.test(ANO_R, "group", alpha = 0.05, console = TRUE)
LSD.test(ANO_R, "group", alpha = 0.05, console = TRUE)
duncan.test(ANO_R, "group", alpha = 0.05, console = TRUE)
t.test(fal_price[[1]], win_price[[1]])
ê²°ê³¼
> head( cbind(seson_price, group) )
seson_price group
[1,] 538000 1
[2,] 180000 1
[3,] 310000 1
[4,] 530000 1
[5,] 650000 1
[6,] 485000 1
> tail( cbind(seson_price, group) )
seson_price group
[21608,] 330000 4
[21609,] 230000 4
[21610,] 645000 4
[21611,] 414500 4
[21612,] 347500 4
[21613,] 350000 4
> describe.by(seson_price, group) # 그룹별 기술통계량 계산
Descriptive statistics by group
group: 1
vars n mean sd median trimmed mad min max range skew
kurtosis se
X1 1 6520 543036.7 363293.6 455700 486463.7 228765.2 78000 7062500 6984500 4
34.18 4499.19
------------------------------------------------------------------------------
group: 2
vars n mean sd median trimmed mad min max range skew
kurtosis se
X1 1 6331 543183.9 377206.2 450000 480799 222390 75000 5570000 5495000 3.72
24.97 4740.7
------------------------------------------------------------------------------
group: 3
vars n mean sd median trimmed mad min max range skew kurtosis
se
X1 1 5063 536213 365608 450000 480103.4 222390 82500 7700000 7617500 5.02 60.6
5138.21
------------------------------------------------------------------------------
group: 4
vars n mean sd median trimmed mad min max range skew
kurtosis se
X1 1 3699 534896.4 358372.5 447500 477146.5 218683.5 83000 3800000 3717000 3.22
16.54 5892.4
> anova(ANO_R)
Analysis of Variance Table
Response: seson_price
Df Sum Sq Mean Sq F value Pr(>F)
group 1 2.4117e+11 2.4117e+11 1.7894 0.181
Residuals 21611 2.9127e+15 1.3478e+11
> scheffe.test(ANO_R, "group", alpha = 0.05, console = TRUE)
Study: ANO_R ~ "group"
Scheffe Test for seson_price
Mean Square Error : 134777455496
group, means
seson_price std r Min Max
1 543036.7 363293.6 6520 78000 7062500
2 543183.9 377206.2 6331 75000 5570000
3 536213.0 365608.0 5063 82500 7700000
4 534896.4 358372.4 3699 83000 3800000
Alpha: 0.05 ; DF Error: 21611
Critical Value of F: 2.60532
Harmonic Mean of Cell Sizes 5133.59
Minimum Significant Difference: 20258.36
Means with the same letter are not significantly different.
Groups, Treatments and means
a 2 543200
a 1 543000
a 3 536200
a 4 534900
계절(ë´„/여름/ê°€ì„/겨울)ì— ë”°ë¥¸ ê·¸ë£¹ì˜ í‰ê· ì—ì„œ 유ì˜ë¯¸í•œ ì°¨ì´ê°€ 나타나지 않았다. ë”°ë¼ì„œ, 계
ì ˆì— ë”°ë¥¸ priceì—는 ì°¨ì´ê°€ 없다고 ë³¼ 수 있다.
3. ìžë£Œ 분ì„
A. ë¶„ì„ ë°©ë²•
i. 회귀 모형
ii. íšŒê·€ëª¨í˜•ì— ë“¤ì–´ê°ˆ ë°ì´í„° 전처리
# Convert the date variable to numeric ----
# Keep only the leading "YYYYMMDD" characters of each date (e.g. "20141124").
date_numeric <- substr(kc_house_data$date, 1, 8)
date_numeric <- as.numeric(date_numeric)
is.numeric(date_numeric) # TRUE
# Replace the original date with the numeric date.
kc_house_data$date <- date_numeric
집값 ~ í¬ê¸°ë¥¼ 나타내는 변수(bathsrooms, bedrooms, floors, sqft_living)
+ 해안가 여부(waterfront _ T/F)
+ ì§‘ì„ ë³´ëŸ¬ì˜¨ 횟수(view _ 0/1/2/3/4)
+ 집 í‰ê°€ì²™ë„(grade)
+ zipcode group(ì´ 12ê°œ)ì— ëŒ€í•œ ë”미변수
# Combine the size-related variables into one score, each weighted by its
# correlation with price. This must run before lm(), which uses room_newnum.
kc_house_data$room_newnum <- kc_house$bedrooms * 0.308 +
  kc_house$bathrooms * 0.525 +
  kc_house$floors * 0.257 +
  kc_house$sqft_living * 0.702
# Read the score back from the data frame it was written to (kc_house_data).
cor(kc_house_data$room_newnum, kc_house$price) # 0.702
View(kc_house_data)

# Price regressed on the size score, waterfront, view, grade and the 12
# zipcode-group dummies (X1..X12).
house_lm <- lm(
  kc_house_data$price ~ room_newnum + waterfront + view +
    grade + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + X12,
  data = kc_house_data
)
iii. zipcode 별로 priceì— ëŒ€í•œ 정규성 검정
# by(A, B, shapiro.test) runs a normality test on the A values of every
# group found in B; here: price within each zipcode.
A <- by(
  data = kc_house$price,
  INDICES = kc_house$zipcode,
  FUN = shapiro.test
)
## p-value > 0.05ì¸ zipcode group
###
# kc_house$zipcode: 98002
#
# Shapiro-Wilk normality test
#
# data: dd[x, ]
# W = 0.99639, p-value = 0.9243
###
# kc_house$zipcode: 98108
#
# Shapiro-Wilk normality test
#
# data: dd[x, ]
# W = 0.99176, p-value = 0.3707
###
B. ë¶„ì„ ê²°ê³¼
i. Stepwiseì— ì˜í•œ ì˜í–¥ë ¥ì´ 있는 ë³€ìˆ˜ì˜ ì„ ë³„
.
Call:
lm(formula = kc_house_data$price ~ room_newnum + waterfront +
view + grade + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 +
X10 + X11 + X12, data = kc_house_data)
Residuals:
Min 1Q Median 3Q Max
-1168280 -111183 -11201 93153 4695648
(2) Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -428984.61 11000.94 -38.995 < 2e-16 ***
room_newnum 219.91 3.37 65.248 < 2e-16 ***
waterfront 596633.57 17493.97 34.105 < 2e-16 ***
view 73086.65 2054.59 35.572 < 2e-16 ***
grade 78201.44 1841.75 42.460 < 2e-16 ***
X1 1210393.45 28972.94 41.777 < 2e-16 ***
X2 631557.38 11627.49 54.316 < 2e-16 ***
X3 359493.57 12361.11 29.083 < 2e-16 ***
X4 467049.42 12554.92 37.200 < 2e-16 ***
X5 335753.53 19958.60 16.822 < 2e-16 ***
X6 187139.64 12507.13 14.963 < 2e-16 ***
X7 118286.47 11234.48 10.529 < 2e-16 ***
X8 64278.26 11002.59 5.842 5.23e-09 ***
X9 85789.09 10854.34 7.904 2.84e-15 ***
X10 -32043.68 12036.78 -2.662 0.00777 **
X11 -103019.44 10948.51 -9.409 < 2e-16 ***
X12 -152845.27 13182.95 -11.594 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 203600 on 21596 degrees of freedom
Multiple R-squared: 0.6926, Adjusted R-squared: 0.6924
(1) F-statistic: 3041 on 16 and 21596 DF, p-value: < 2.2e-16
ii. íšŒê·€ë¶„ì„ ê²°ê³¼ 분ì„
1단계 : íšŒê·€ëª¨í˜•ì€ í†µê³„ì ìœ¼ë¡œ 타당한가?
귀무가설 : íšŒê·€ëª¨í˜•ì€ íƒ€ë‹¹í•˜ì§€ 않다.
대립가설 : íšŒê·€ëª¨í˜•ì€ íƒ€ë‹¹í•˜ë‹¤.
(1) F-statistic: 3041 on 16 and 21596 DF, p-value: < 2.2e-16
(1)ì˜ ì¶œë ¥ ê²°ê³¼ë¬¼ì„ ë³´ë©´ p-value ê°’ì´ 0.000 ì´ë¯€ë¡œ ê·€ë¬´ê°€ì„¤ì„ ê¸°ê°í•œë‹¤.
1ë‹¨ê³„ì˜ ê²°ë¡  : 대립가설, íšŒê·€ëª¨í˜•ì€ íƒ€ë‹¹í•˜ë‹¤
2단계 : ë…립변수 ê°ê°ì€ 종ì†ë³€ìˆ˜ì—게 ì˜í–¥ì„ 주는가?
귀무가설 : ë…립변수는 종ì†ë³€ìˆ˜ì—게 ì˜í–¥ì„ 주지 않는다.
대립가설 : ë…립변수는 종ì†ë³€ìˆ˜ì—게 ì˜í–¥ì„ 준다.
(2) Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -428984.61 11000.94 -38.995 < 2e-16 ***
room_newnum 219.91 3.37 65.248 < 2e-16 ***
waterfront 596633.57 17493.97 34.105 < 2e-16 ***
view 73086.65 2054.59 35.572 < 2e-16 ***
grade 78201.44 1841.75 42.460 < 2e-16 ***
X1 1210393.45 28972.94 41.777 < 2e-16 ***
X2 631557.38 11627.49 54.316 < 2e-16 ***
X3 359493.57 12361.11 29.083 < 2e-16 ***
X4 467049.42 12554.92 37.200 < 2e-16 ***
X5 335753.53 19958.60 16.822 < 2e-16 ***
X6 187139.64 12507.13 14.963 < 2e-16 ***
X7 118286.47 11234.48 10.529 < 2e-16 ***
X8 64278.26 11002.59 5.842 5.23e-09 ***
X9 85789.09 10854.34 7.904 2.84e-15 ***
X10 -32043.68 12036.78 -2.662 0.00777 **
X11 -103019.44 10948.51 -9.409 < 2e-16 ***
X12 -152845.27 13182.95 -11.594 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(2)ì˜ ì¶œë ¥ ê²°ê³¼ë¬¼ì„ ë³´ë©´ 유ì˜í™•ë¥ (Pr(>|t|))ì´ 0.000 ì— ê°€ê¹Œìš°ë¯€ë¡œ ê·€ë¬´ê°€ì„¤ì„ ê¸°ê°í•˜ì—¬,
유ì˜ìˆ˜ì¤€ì¸ 0.05 구간ì—ì„œ ë…립변수는 종ì†ë³€ìˆ˜ì—게 ì˜í–¥ì„ 준다는 ëŒ€ë¦½ê°€ì„¤ì„ ì±„íƒí•œë‹¤.
2ë‹¨ê³„ì˜ ê²°ë¡  : ë…립변수는 종ì†ë³€ìˆ˜ì—게 ì˜í–¥ì„ 준다.
ë³€ìˆ˜ì„ íƒ : 단계 ì„ íƒë²•(Stepwise Selection)방법 ì„ íƒ
# Usage: step(<fitted regression model>,
#             direction = "forward" | "backward" | "both")
# Stepwise selection in both directions, starting from the full model.
model.stepwise <- step(house_lm, direction = "both")
summary(model.stepwise)
3단계 : ë³€ìˆ˜ì„ íƒ (결론출력)
ë…립변수ì˜íšŒê·€ê³„수(coefficient of Regression) : 0.6926
4단계 : 회귀모형ì˜ì„¤ëª…ë ¥ = ë…립변수ì˜ì„¤ëª…
Multiple R-squared : 0.6926
0.6926 * 100 = 69.2%
ì„¤ëª…ê³„ìˆ˜ì¸ R-squared 를 출력한 결과가 price ì˜ ë‹¤ë¦„ì„ ì•½ 69.2%를 설명한다.
단, 고려해야할 ì ì´ 있다. 만약, ìµœì¢…íšŒê·€ëª¨í˜•ì— ë…립변수가 2ê°œì´ìƒ í¬í•¨ì´ ë˜ë©´
첫째, íšŒê·€ê³„ìˆ˜ì˜ í•´ì„ì„ í™•ì¸í•´ì•¼ 한다. ë…립변수1ì€ ë‚˜ë¨¸ì§€ ë…ë¦½ë³€ìˆ˜ë“¤ì´ ê³ ì •ë˜ì–´ 있ì„
ë•Œì—(통제) 1ì˜ê¸°ë³¸ë‹¨ìœ„ê°€ 1 ì¦ê°€í•˜ë©´ 종ì†ë³€ìˆ˜ëŠ” 약 얼마 ì¦ê°€/ê°ì†Œí•˜ëŠ”지 고려해야 한다.
둘째, 다중공선성(Multicollinearity)ì„ í™•ì¸í•´ì•¼ 한다. ë…ë¦½ë³€ìˆ˜ë“¤ê°„ì˜ ì„ í˜•ì˜ ê´€ê³„ëŠ” 없어야
한다.
예를 들어, VIF(Variance Inflation Factor) : 10 이상이면 다중공선성이 존재한다고 판단하며,
ë…립변수들 ê°„ì— ì„ í˜•ì˜ ê´€ê³„ê°€ ì¡´ìž¬í•¨ì„ ì•Œ 수 있다. 만약 ì´ëŸ¬í•œ 결과가 나온다면
ë…립변수들 ì¤‘ì— ë¹¼ëŠ” ê²ƒì„ ê²€í† í•œë‹¤.
다중공선성(Multicollinearity)ì„ í™•ì¸
# Variance inflation factor of each predictor in the stepwise model.
library(car)
car::vif(model.stepwise)
> car::vif(model.stepwise)
room_newnum waterfront view grade X1 X2 X3
X4
2.465253 1.194046 1.292139 2.442943 1.009935 1.018493 1.025663
1.009917
X5 X6 X7 X8 X9 X10 X11
X12
1.003873 1.005920 1.012795 1.005337 1.003162 1.003162 1.003869
1.002985
모든 ë³€ìˆ˜ë“¤ì— ëŒ€í•œ VIF ê°’ì´ 10 ì´í•˜ì´ë¯€ë¡œ ë…립변수 ê°„ì˜ ë‹¤ì¤‘ê³µì„ ì„±ì€ ì¡´ìž¬í•˜ì§€ 않는다.
íšŒê·€ëª¨í˜•ì˜ ì„¤ëª…ë ¥ : Adjusted R-Square
Adjusted R-squared: 0.6924
ë…ë¦½ë³€ìˆ˜ë“¤ì˜ ì˜í–¥ë ¥ í¬ê¸° 비êµ
# Standardized coefficients, to compare the predictors' relative influence.
# Applied to the model fitted in this analysis (model.stepwise); `attitude.lm`
# is not defined anywhere in this script.
library(lm.beta)
lm.beta::lm.beta(model.stepwise)
> lm.beta::lm.beta(model.stepwise)
Call:
lm(formula = kc_house_data$price ~ room_newnum + waterfront +
view + grade + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 +
X10 + X11 + X12, data = kc_house_data)
Standardized Coefficients::
(Intercept) room_newnum waterfront view grade X1 X2
X3
0.00000000 0.38651375 0.14060267 0.15255635 0.25038342 0.15839614 0.20680922
0.11112192
X4 X5 X6 X7 X8 X9 X10
X11
0.14104423 0.06359071 0.05661791 0.03997667 0.02209983 0.02986607 -0.01005961 -
0.03556859
X12
-0.04380771
4. 예측
# Predictor columns used by the model: the size score, waterfront, view,
# grade and the dummies X1..X12 (names generated so none can be mistyped or
# broken across lines).
kc_house_final <- kc_house_data[
  ,
  c("room_newnum", "waterfront", "view", "grade", paste0("X", 1:12))
]
# Usage: predict(<regression result>, newdata = data.frame(...))
# Point predictions with prediction intervals for the first five houses.
predict(
  model.stepwise,
  newdata = data.frame(kc_house_final[1:5, ]),
  interval = "prediction"
)
View(kc_house_data)
1) 예측값
> predict(model.stepwise, newdata = data.frame(kc_house_final[1:6,]), interval = "predict") # ì ì¶”
ì •?
fit lwr upr
1 219839.6 -186103.7 625782.8
2 285515.2 -120405.3 691435.7
3 268629.0 -137296.2 674554.2
4 493984.5 87995.8 899973.1
2) 실제값
zipcode id date price
1 98001 302000375 20140814 169100
2 98001 6181400920 20150430 286651
3 98001 2005950050 20140527 260000
4 98001 8956200070 20140905 447500
5. 최종 결론
í‘œì¤€í™”ëœ íšŒê·€ê³„ìˆ˜ì— ì˜í•´ì„œ, priceì— ê°€ìž¥ ì˜í–¥ì„ 미치는 변수는 ì§‘ì˜ í¬ê¸°ì™€ ê´€ë ¨ëœ ë³€
수들(화장실수, 침실수, 층수, í‰ìˆ˜)와 gradeë¼ëŠ” ê²ƒì„ ì•Œ 수 있다.
변수명 변수 설명 í‘œì¤€í™”ëœ ìƒê´€ê³„수
room_newnum
ì§‘ì˜ í¬ê¸°ì™€ ê´€ë ¨ëœ ë³€ìˆ˜ë“¤ì„ í•œê°€ì§€
ì˜ ë³€ìˆ˜ë¡œ 모ë¸ë§ í•œ 변수
(화장실수, 침실수, 층수, í‰ìˆ˜)
0.387
grade
Kingcounty grading systemì— ì˜í•œ
ì§‘ì— ëŒ€í•œ í‰ê°€ì²™ë„
0.250
6. ë¯¸ë¹„ì  ë° ê°œì„ ë°©í–¥
- 집 í¬ê¸°ì— ì˜í–¥ì„ 받는 변수들(bathrooms, bedrooms, floors, sqft_living)ì„ í•˜ë‚˜ì˜ ë°ì´í„°ë¡œ 모ë¸
ë§í•˜ëŠ” 과정ì—ì„œ 가중치 ì„ ì •ë°©ë²•ì´ ë¯¸í¡í–ˆë˜ 것 같다. ì´ëŸ¬í•œ ë³€ìˆ˜ë“¤ì´ ì‹¤ì œë¡œ ì§‘ê°’ì— ì˜í–¥ì„
미치는 ì •ë„ì— ë”°ë¼ì„œ 가중치를 선정하려고 했으나, 논문ìžë£Œì—ì„œ ì´ì— 대한 정보를 찾지 못했다.
그래서 차선책으로 ê° ë³€ìˆ˜ë“¤ì— ëŒ€í•œ ìƒê´€ê³„수를 곱해서 ë”해주는 ë°©ì‹ì„ ì„ íƒí–ˆë‹¤. ì´ ë°©ë²•ì€
현재 주어진 ìžë£Œì—서만 유효하므로, 다ìŒì— 기회가 ëœë‹¤ë©´ ì „ì²´ ì§‘ê°’ì— ìœ ì˜ë¯¸í•œ 모ë¸ë§ì„ í•´ë³´
고 싶다.
- 집 별로 í‰ìˆ˜ì˜ ì°¨ì´ê°€ ìžˆëŠ”ë° ë‹¨ìœ„ í‰ìˆ˜ë‹¹ ê°€ê²©ì„ ê³ ë ¤í•˜ì§€ ì•Šê³  단순히 zipcode별 ì§‘ê°’ì— ëŒ€
í•œ 분ì„ì„ ì§„í–‰í•˜ì˜€ë‹¤. ë”°ë¼ì„œ í‰ìˆ˜ë‹¹ 가격보다는 ì§€ì—­ì˜ ìš”ì¸ì— 중ì ì„ ë‘었으며, 누ë½ë˜ëŠ” 부분
ì´ ì¶©ë¶„ížˆ ìžˆì—ˆì„ ê²ƒì´ë¼ê³  íŒë‹¨ëœë‹¤.
- 주어진 ë°ì´í„° ì´ì™¸ì— 새로운 ë³€ìˆ˜ë“¤ì„ ë§Œë“¤ì–´ë‚´ë ¤ê³  노력하였는ë°, ë§Žì€ ë³€ìˆ˜ë“¤(School,
Seasonality 등)ì´ priceì— ìœ ì˜ë¯¸í•œ ì˜í–¥ì„ 주지 ì•Šì•„ 채íƒí•˜ì§€ 못했다. 변수를 세부그룹으로 나
누거나 ì´ìƒì¹˜ë¥¼ 제거하는 등 ë°ì´í„° 전처리 ê³¼ì •ì„ ê±°ì¹œë‹¤ë©´ 채íƒí•˜ì§€ 못한 변수들과 price 사
ì´ì— 유ì˜ë¯¸í•œ ìƒê´€ê´€ê³„를 ì°¾ì„ ìˆ˜ ìžˆì„ ê²ƒì´ë¼ê³  ìƒê°í•œë‹¤. ì „ì²´ì ìœ¼ë¡œ ë°ì´í„°ë¥¼ 전처리 하는
ê³¼ì •ì´ ë¶€ì¡±í•˜ì˜€ë˜ ì ì´ 아쉬웠다.
- 관련 ë…¼ë¬¸ì„ ì°¾ì•„ë³´ë‹¤ê°€ ì§‘ê°’ì´ ë¹„ì‹¸ì§ˆìˆ˜ë¡ í™”ìž¥ì‹¤ìˆ˜, í‰ìˆ˜ 등과 ê°™ì€ ë¬¼ë¦¬ì ì¸ 특성보다 해안
ê°€, í•™êµ ìˆ˜, 백화ì ê³¼ì˜ 거리 ë“±ì˜ ìž…ì§€ì ì¸ íŠ¹ì„±ì— ë”ìš± ì˜í–¥ì„ ë§Žì´ ë°›ëŠ”ë‹¤ëŠ” 연구결과를 알게
ë˜ì—ˆë‹¤. ì§‘ê°’ì´ ë¹„ì‹¼ 집들과 비싸지 ì•Šì€ ì§‘ë“¤ì„ ê°ê° 분ì„í•´ì„œ 여러 ê°€ì§€ì˜ íšŒê·€ëª¨í˜•ì„ ì„ ì •í•´
보는 ê²ƒë„ ì¢‹ì€ í”„ë¡œì íŠ¸ê°€ ë˜ì—ˆì„ 것 같다.
종ì†ë³€ìˆ˜ì— 대한 시계열 ìžë£Œ
# Time-series view of the dependent variable (price) ----
# NOTE(review): ts() consumes kc_house$price in row order as one value per
# day; confirm the rows really form a daily series before trusting the fit.
house_price_ts <- ts(kc_house$price,
  start = c(2014, 182), end = c(2015, 182),
  frequency = 365
)
plot(house_price_ts,
  ylab = "Kingcounty House Price", xlab = "Year.Month",
  xlim = c(2014.5, 2015.5)
)
old_par <- par(mfrow = c(1, 2)) # two panels side by side; restored below
acf(house_price_ts) # bars beyond the blue dashed band are significant
pacf(house_price_ts)
# ndiffs(), auto.arima() and forecast() come from the forecast package; they
# are namespace-qualified because the script never attaches that package.
forecast::ndiffs(house_price_ts) # 0
# auto.arima() searches for the best-fitting ARIMA specification. The series
# is passed positionally; the `x` argument name is deprecated.
houseBest <- forecast::auto.arima(house_price_ts)
houseBest
par(mfrow = c(1, 2))
library(scales)
# The forecast includes 80% and 95% intervals.
houseforecast <- forecast::forecast(houseBest, h = 5)
plot(houseforecast)
par(old_par) # put the plotting layout back
House pricing prediction in R(Regression Project)
House pricing prediction in R(Regression Project)

More Related Content

House pricing prediction in R(Regression Project)

  • 1. ë‹¤ì¤‘íšŒê·€ë¶„ì„ Multi-Regression Analyse â€œë¯¸êµ­ì˜ ì§‘ê°’ì— ì˜í–¥ì„ 미치는 ìš”ì¸ë“¤ì— 대한 분섆House Pricing 1. ë…ë¦½ë³€ìˆ˜ì— ëŒ€í•œ 설명 (1) ìžë£Œì— 대한 요약 i. 모든 변수들 ê°„ì˜ ìƒê´€ê´€ê³„ # date 변수를 numeric 으로 바꾸기 date_numeric <- substr(kc_house$date, 1, 8) # 20141124 형태로 ì—°ë„날짜 문ìžë¶€ë¶„만 추출 is.numeric(date_numeric) # FALSE date_numeric <- as.numeric(date_numeric) is.numeric(date_numeric) # TRUE kc_house$date <- date_numeric # ê¸°ì¡´ì˜ date 를 새로운 numeric date ë¡œ 대체 View(kc_house) house_cor <- cor(kc_house) # 모든 ë³€ìˆ˜ë“¤ì˜ ì„œë¡œì„œë¡œ ê°„ì˜ ìƒê´€ê´€ê³„를 계산 # library(psych) house_cortest <- psych::corr.test(kc_house) DT::datatable(house_cortest) View(house_cortest) library(Hmisc) Hmisc::rcorr(as.matrix(attitude)) round(house_cor, 2) # ì†Œìˆ«ì  ë‘˜ì§¸ìžë¦¬ê¹Œì§€ round pairs(house_cor, pch = 19, bg = c("red", "green", "blue")) # 행렬모양 ì‚°ì ë„ corrplot(house_cor) # ìƒê´€ì›ê³„수가 í´ìˆ˜ë¡ í¬ê¸°ê°€ í¬ê³  ìƒ‰ê¹”ì´ ì§„í•˜ë‹¤ # 양수면 파란색, ìŒìˆ˜ë©´ 붉ì€ìƒ‰ corrplot(house_cor, method = "number") # 수와 색깔로 표현 col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA")) corrplot(house_cor, method = "color", # 색깔로 표현 col = col(200), # ìƒ‰ìƒ 200 ê°œ ì„ ì • type = "lower", # 왼쪽 아래 행렬만 표기 order = "hclust", # 유사한 ìƒê´€ê³„수ë¼ë¦¬ 군집화 addCoef.col = "black", # ìƒê´€ê³„수 색깔 tl.col = "black", # 변수명 색깔 tl.srt = 45, # 변수명 45 ë„ ê¸°ìš¸ìž„ diag = FALSE) # 대ê°í–‰ë ¬ 제외
  • 2. i-1. 상관계수를 세 그룹으로 나누어 표현 Plot1. price, bedrooms, bathrooms, sqft_living and sqft lot
  • 4. Plot2. price, floors, waterfront, view, condition and grade
  • 5. Plot 3. price, yr built, lat and long < ê²°ê³¼ > # ìƒê´€ê´€ê³„ê°€ ë†’ì€ ë³€ìˆ˜ë“¤ ( > 0.5) # sqft_lot15 - sqft_lot : 0.72 # 둘다 ì‚­ì œ # bathrooms - floors : 0.50 # 변환(모ë¸ë§) # bathrooms - yr_built : 0.51 # bathrooms ì±„íƒ # bathrooms - bedrooms : 0.52 # 변환(모ë¸ë§) # price - bathrooms : 0.53 # sqft_living15 - bathrooms : 0.57 # bathrooms ì±„íƒ # sqft_living15 - price : 0.59 # grade - bathrooms : 0.66 # bathrooms 변환 # grade - price : 0.67 # grade - sqft_living15 : 0.71 # grade ì±„íƒ # sqft_living - bedrooms : 0.58 # 변환(모ë¸ë§)
  • 6. # sqft_living - bathrooms : 0.75 # 변환(모ë¸ë§) # sqft_living - price : 0.70 # sqft_living - sqft_living15 : 0.76 # sqft_living 채태 # sqft_living - grade : 0.76 # sqft_living 변환 # sqft_above - floors : 0.52 # floors ì±„íƒ # sqft_above - bathrooms : 0.69 # bathrooms ì±„íƒ # sqft_above - price : 0.61 # sqft_above - sqft_living15 : 0.73 # 둘다 ì‚­ì œ # sqft_above - grade : 0.76 # grade ì±„íƒ # sqft_above - sqft_living : 0.88 # sqft_living ì±„íƒ # zipcode - long : -0.56 # zipcode ì±„íƒ ì±„íƒë³€ìˆ˜ : bedrooms, bathrooms, sqft_living, floors, view, grade ii. price ìƒìœ„ 25% ì§‘ì„ ì§€ë„ì— í‘œì‹œ house_map + ggplot2::geom_point(data = high_25_loc, aes(x = long, y = lat), colour="red")
  • 7. iii. zipcode 별로 ì§‘ì˜ ìœ„ì¹˜ì™€ ë°€ë„를 지ë„ì— í‘œì‹œ zipcode_1_10_loc <- kc_house[(kc_house$zipcode >=98001)&(kc_house$zipcode <=98010), c("long", "lat")] #집코드별로 그룹화 zipcode_11_20_loc <- kc_house[(kc_house$zipcode >=98011)&(kc_house$zipcode <=98020), c("long", "lat")] zipcode_21_30_loc <- kc_house[(kc_house$zipcode >=98021)&(kc_house$zipcode <=98030), c("long", "lat")] zipcode_31_40_loc <- kc_house[(kc_house$zipcode >=98031)&(kc_house$zipcode <=98040), c("long", "lat")] zipcode_41_50_loc <- kc_house[(kc_house$zipcode >=98041)&(kc_house$zipcode <=98050), c("long", "lat")] house_map <- get_googlemap(center = c(lon =-122.1, lat =47.5), zoom =10) %>% ggmap house_map + ggplot2::geom_point(data = zipcode_1_10_loc, aes(x = long, y = lat), colour="red") + ggplot2::geom_point(data = zipcode_11_20_loc, aes(x = long, y = lat), colour="orange") + ggplot2::geom_point(data = zipcode_21_30_loc, aes(x = long, y = lat), colour="yellow") +
  • 8. ggplot2::geom_point(data = zipcode_31_40_loc, aes(x = long, y = lat), colour="green") + ggplot2::geom_point(data = zipcode_41_50_loc, aes(x = long, y = lat), colour="blue") # zipcode group ì´ëŸ°ì‹ìœ¼ë¡œ ì§‘ì„ ggmap ê³¼ ggplot2 패키지를 ì´ìš© ê·¸ë¦¼ì— í‘œí˜„í•´ì¤Œ zipcode__1_loc <- kc_house[kc_house$zipcode == grep("1$", kc_house$zipcode, value = TRUE), c("long", "lat")] zipcode__2_loc <- kc_house[kc_house$zipcode == grep("2$", kc_house$zipcode, value = TRUE), c("long", "lat")] zipcode__3_loc <- kc_house[kc_house$zipcode == grep("3$", kc_house$zipcode, value = TRUE), c("long", "lat")] house_map + ggplot2::geom_point(data = zipcode__1_loc, aes(x = long, y = lat), colour="red") + ggplot2::geom_point(data = zipcode__2_loc, aes(x = long, y = lat), colour="orange") + ggplot2::geom_point(data = zipcode__3_loc, aes(x = long, y = lat), colour="yellow")
  • 9. (2) 채íƒí•œ ë…립변수 - room_newnum : ì§‘ì˜ í¬ê¸°ì— ì˜í–¥ì„ 미치는 변수들(bathrooms, bedrooms, floors, sqft_living) ì„ í•˜ë‚˜ì˜ ë³€ìˆ˜ë¡œ 모ë¸ë§ - waterfront : 해안가(waterfront) 여부 - view : ì§‘ì„ ë³´ëŸ¬ì˜¨ 횟수 - grade : ì§‘ì— ëŒ€í•œ ì „ë°˜ì ì¸ í‰ê°€ì²™ë„ - X1 ~ X12 : priceê°€ 비슷한 zipcode 별 groupí™” i. room_newnum : ì§‘ì˜ í¬ê¸°ì— ì˜í–¥ì„ 미치는 변수들(bathrooms, bedrooms, floors, sqft_living)ì„ í•˜ë‚˜ì˜ ë³€ìˆ˜ë¡œ 모ë¸ë§ - 분ì„방법 : ìƒê´€ë¶„ì„ # bedrooms, bathrooms, floors ë°ì´í„°ëª¨ë¸ë§í•˜ê¸° cor(kc_house$bedrooms, kc_house$price) # 0.308 cor(kc_house$bathrooms, kc_house$price) # 0.525 cor(kc_house$floors, kc_house$price) # 0.257 cor(kc_house$sqft_living, kc_house$price) # 0.702 cor(kc_house$sqft_lot, kc_house$price) # 0.090 # 제외! kc_house_data$room_newnum = kc_house$bedrooms*0.308+ kc_house$bathrooms*0.525+ kc_house$floors*0.257+ kc_house$sqft_living*0.702 <ë° ì´í„° 모ë¸ë§ ì‹> kc_house_data$room_newnum = kc_house$bedrooms*0.308 + kc_house$bathrooms*0.525 + kc_house$floors*0.257 + kc_house$sqft_living*0.702 cor(kc_house$room_newnum, kc_house$price) # 0.702 ii. waterfront : 해안가(waterfront) 여부 - 분ì„방법 : 양측 가설검정 (t-test) # 해안가(=1) group ì˜ ìœ„ì¹˜ waterfront_T <- kc_house[kc_house$waterfront ==1, c("long", "lat")] # 해안가가 ì•„ë‹Œ(=0) group ì˜ ìœ„ì¹˜ waterfront_F <- kc_house[kc_house$waterfront ==0, c("long", "lat")] house_map <- get_googlemap(center = c(lon =-122.1, lat =47.5), zoom =10) %>% ggmap house_map + ggplot2::geom_point(data = waterfront_T, aes(x = long, y = lat), colour="red") + ggplot2::geom_point(
  • 10. data = waterfront_F, aes(x = long, y = lat), colour="orange") # 해안가(=1) group ì˜ price waterfront_T_price <- kc_house[kc_house$waterfront ==1, "price"] # 해안가가 ì•„ë‹Œ(=0) group ì˜ price waterfront_F_price <- kc_house[kc_house$waterfront ==0, "price"] summary(waterfront_T_price[[1]]) summary(waterfront_F_price[[1]]) t.test(waterfront_T_price[[1]], waterfront_F_price[[1]]) boxplot(waterfront_T_price[[1]], waterfront_F_price[[1]]) ê²°ê³¼ Welch Two Sample t-test data: waterfront_T_price[[1]] and waterfront_F_price[[1]] t = 12.876, df = 162.23, p-value < 2.2e-16 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: 956963.3 1303661.6 sample estimates: mean of x mean of y 1661876.0 531563.6 귀무가설 : ë‘ ê·¸ë£¹ì˜ í‰ê· ì€ 같다 대립가설 : ë‘ ê·¸ë£¹ì˜ í‰ê· ì€ 같지 않다 p-value < 0.000 ì´ë¯€ë¡œ, 유ì˜ìˆ˜ì¤€ a=0.05ì—ì„œ ê·€ë¬´ê°€ì„¤ì„ ê¸°ê°í•œë‹¤. 즉, í•´ì•ˆê°€ì— ìœ„ì¹˜í•œ 집들 ì˜ í‰ê·  집값과 í•´ì•ˆê°€ì— ìœ„ì¹˜í•˜ì§€ ì•Šì€ ì§‘ë“¤ì˜ í‰ê·  ì§‘ê°’ì´ ë‹¤ë¥´ë‹¤ê³  ë³¼ 수 있다. iii. view : ì§‘ì„ ë³´ëŸ¬ì˜¨ 횟수 - 분ì„방법 : 여러 집단 ê°„ì˜ ì°¨ì´ê²€ì •(ANOVA, scheffe.test) view0_price <- kc_house[kc_house$view ==0,"price"] length(view0_price[[1]]) # 19489 view1_price <- kc_house[kc_house$view ==1,"price"] length(view1_price[[1]]) # 332 view2_price <- kc_house[kc_house$view ==2,"price"] length(view2_price[[1]]) # 963 view3_price <- kc_house[kc_house$view ==3,"price"] length(view3_price[[1]]) # 510 view4_price <- kc_house[kc_house$view ==4,"price"] length(view4_price[[1]]) # 319
  • 11. mean(view0_price[[1]]) # 496564.2 mean(view1_price[[1]]) # 812280.8 mean(view2_price[[1]]) # 792400.9 mean(view3_price[[1]]) # 971965.3 mean(view4_price[[1]]) # 1463711 view_price <- c(view0_price[[1]], view1_price[[1]], view2_price[[1]], view3_price[[1]], view4_price[[1]]) length(view_price) # 21613 length(group) # 21613 group <- c(rep(0, 19489), rep(1, 332), rep(2, 963), rep(3, 510), rep(4, 319)) cbind(view_price, group) boxplot(view_price ~ group, ylab = "Price", xlab = "View") describe.by(view_price, group) # 그룹별기술통계량계산 # mad ANO_R <- aov(view_price ~ group) anova(ANO_R) library(agricolae) scheffe.test(ANO_R, "group", alpha =0.05, console = TRUE) LSD.test(ANO_R, "group", alpha =0.05, console = TRUE) duncan.test(ANO_R, "group", alpha =0.05, console = TRUE) ê²°ê³¼ > head( cbind(view_price, group) ) view_price group [1,] 221900 0 [2,] 538000 0 [3,] 180000 0 [4,] 604000 0 [5,] 510000 0 [6,] 1225000 0 > tail( cbind(view_price, group) ) view_price group [21608,] 580000 4 [21609,] 2300000 4 [21610,] 1149000 4 [21611,] 900000 4 [21612,] 2230000 4 [21613,] 3567000 4
  • 12. > describe.by(view_price, group) # 그룹별 기술통계량 계산 # mad Descriptive statistics by group group: 0 vars n mean sd median trimmed mad min max range skew kurtosis se X1 1 19489 496564.2 287133.3 432500 456422 203857.5 75000 5570000 5495000 3.11 21.58 2056.78 ------------------------------------------------------------------------------ group: 1 vars n mean sd median trimmed mad min max range skew kurtosis se X1 1 332 812280.8 510949.7 690944 722550.6 308714.4 217000 3650000 3433000 2.26 6.65 28042.01 ------------------------------------------------------------------------------ group: 2 vars n mean sd median trimmed mad min max range skew kurtosis se X1 1 963 792400.9 510105 675000 714267.2 318759 169317 7062500 6893183 3.57 27.65 16437.91
  • 13. ------------------------------------------------------------------------------ group: 3 vars n mean sd median trimmed mad min max range skew kurtosis se X1 1 510 971965.3 612692.2 802500 892245.3 450710.4 154000 7700000 7546000 3.38 28.12 27130.47 ------------------------------------------------------------------------------ group: 4 vars n mean sd median trimmed mad min max range skew kurtosis se X1 1 319 1463711 952209.6 1185000 1320949 667170 252000 6885000 6633000 1.84 4.87 53313.5 > scheffe.test(ANO_R, "group", alpha = 0.05, console = TRUE) Study: ANO_R ~ "group" Scheffe Test for view_price Mean Square Error : 113513294940 group, means view_price std r Min Max 0 496564.2 287133.3 19489 75000 5570000 1 812280.8 510949.7 332 217000 3650000 2 792400.9 510105.1 963 169317 7062500 3 971965.3 612692.2 510 154000 7700000 4 1463711.2 952209.6 319 252000 6885000 Alpha: 0.05 ; DF Error: 21611 Critical Value of F: 2.372343 Harmonic Mean of Cell Sizes 543.6342 Minimum Significant Difference: 62951.16 Means with the same letter are not significantly different.
  • 14. Groups, Treatments and means a 4 1464000 b 3 972000 c 1 812300 c 2 792400 d 0 496600 ì§‘ì„ ë³´ëŸ¬ì˜¨ 횟수가 1ë²ˆì¸ ê²½ìš°ì™€ 2ë²ˆì¸ ê²½ìš°ì˜ í‰ê· ì€ 유ì˜ë¯¸í•œ ì°¨ì´ê°€ 발견ë˜ì§€ ì•Šì•„, 비슷 í•œ 그룹ì´ë¼ê³  ë´ë„ 무방하다. ì§‘ì„ ë³´ëŸ¬ì˜¨ 횟수가 많ì„ìˆ˜ë¡ ì§‘ì˜ í‰ê· ê°’ì´ ë†’ì•„ì§€ëŠ” ê²½í–¥ì´ ìžˆë‹¤. iv. grade : ì§‘ì— ëŒ€í•œ ì „ë°˜ì ì¸ í‰ê°€ì²™ë„ - 분ì„방법 : ìƒê´€ê´€ê³„ ë¶„ì„ > cor.test(kc_house$grade, kc_house$price) Pearson's product-moment correlation data: kc_house$grade and kc_house$price t = 131.76, df = 21611, p-value < 2.2e-16 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: 0.6599749 0.6747621 sample estimates: cor 0.6674343 p-value < 2.2e-16 ì´ë¯€ë¡œ ìƒê´€ê´€ê³„ 분ì„ì— ëŒ€í•œ 결과가 유ì˜ë¯¸í•˜ë‹¤. 즉, grade변수와 price ë³€ 수 사ì´ì— ìƒê´€ê³„수는 0.667 ì´ë©°, 비êµì  ê°•í•œ ì–‘ì˜ ìƒê´€ê´€ê³„ê°€ 있다고 ë³¼ 수 있다. plot(kc_house$grade, kc_house$price, ylab = "Price", xlab = "Grade")
  • 15. v. X1 ~ X12 : priceê°€ 비슷한 zipcode 별 groupí™” # zipcode 별 price ì˜í‰ê· ì„ group í™” kc_house_DT <- as.data.table(kc_house) zipcode_price_group <- kc_house_DT[ , list(n = .N, Mean = mean(price)), by = list(zipcode, school_el, school_mi, school_hi, school_to)] head(zipcode_price_group) zipcode_price_group[sort(zipcode_price_group$Mean, decreasing = TRUE),c("zipcode","Mena")] zipcode_price_group_DT <- as.data.table(zipcode_price_group) zipcode_price_group_DT <- zipcode_price_group_DT[order(Mean, decreasing = TRUE) , ] summary(zipcode_price_group_DT) # Mean # Min. : 234284 # 1st Qu.: 354126 # Median : 491952 # Mean : 560774 # 3rd Qu.: 645438 # Max. : 2160607 zipcode_group_1 <- zipcode_price_group_DT[1, "zipcode"] zipcode_group_2 <- zipcode_price_group_DT[2, "zipcode"] zipcode_group_3 <- zipcode_price_group_DT[3, "zipcode"]
  • 16. zipcode_group_4 <- zipcode_price_group_DT[4, "zipcode"] zipcode_group_5 <- zipcode_price_group_DT[5, "zipcode"] zipcode_group_6 <- zipcode_price_group_DT[6:11, "zipcode"] zipcode_group_7 <- zipcode_price_group_DT[12:13, "zipcode"] zipcode_group_8 <- zipcode_price_group_DT[14:25, "zipcode"] zipcode_group_9 <- zipcode_price_group_DT[26:34, "zipcode"] zipcode_group_10 <- zipcode_price_group_DT[35:48, "zipcode"] zipcode_group_11 <- zipcode_price_group_DT[49:61, "zipcode"] zipcode_group_12 <- zipcode_price_group_DT[62:70, "zipcode"] dummies <- data.frame(matrix(nrow = nrow(kc_house), ncol =12)) dummies[,1] <- ifelse(kc_house$zipcode == zipcode_group_1, 1, 0) dummies[,2] <- ifelse(kc_house$zipcode == zipcode_group_2, 1, 0) dummies[,3] <- ifelse(kc_house$zipcode == zipcode_group_3, 1, 0) dummies[,4] <- ifelse(kc_house$zipcode == zipcode_group_4, 1, 0) dummies[,5] <- ifelse(kc_house$zipcode == zipcode_group_5, 1, 0) dummies[,6] <- ifelse(kc_house$zipcode == zipcode_group_6, 1, 0) dummies[,7] <- ifelse(kc_house$zipcode == zipcode_group_7, 1, 0) dummies[,8] <- ifelse(kc_house$zipcode == zipcode_group_8, 1, 0) dummies[,9] <- ifelse(kc_house$zipcode == zipcode_group_9, 1, 0) dummies[,10] <- ifelse(kc_house$zipcode == zipcode_group_10, 1, 0) dummies[,11] <- ifelse(kc_house$zipcode == zipcode_group_11, 1, 0) dummies[,12] <- ifelse(kc_house$zipcode == zipcode_group_12, 1, 0) for(i in1:12){ dummies[,i] <- ifelse(kc_house$cluster == zipcode_group_i, 1, 0) } kc_house_data <- cbind(kc_house, dummies) ê²°ê³¼ > head(zipcode_price_group) zipcode school_el school_mi school_hi school_to n Mean 1: 98001 16 4 6 26 362 280804.7 2: 98002 7 4 3 14 199 234284.0 3: 98003 10 6 5 21 280 294111.3 4: 98004 5 3 4 12 317 1355927.1 5: 98005 5 3 5 13 168 810164.9 6: 98006 9 3 2 14 498 859684.8
  • 17. (3) 채택하지 못한 독립변수 - house_age : 주택이 오래된 정도 (2017 – yr_built) - renovated_TF : renovate의 여부 - school_to : zipcode group 별 학교수 - season_price : 집이 팔린 계절별(봄/여름/가을/겨울) i. house_age : 주택이 오래된 정도 (2017 – yr_built) - 분석방법 : 상관관계 분석 summary(kc_house$yr_built) # Min. 1st Qu. Median Mean 3rd Qu. Max.
  • 18. # 1900 1951 1975 1971 1997 2015 house_age =2017- kc_house$yr_built summary(house_age) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 2.00 20.00 42.00 45.99 66.00 117.00 cor(house_age, kc_house$price) -0.054 cor.test(house_age, kc_house$price) plot(house_age, kc_house$price) ê²°ê³¼ Pearson's product-moment correlation data: house_age and kc_house$price t = -7.9517, df = 21611, p-value = 1.93e-15 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: -0.06729506 -0.04070886 sample estimates: cor -0.05401153 p-valueê°€ 0.000ê°’ì´ë¯€ë¡œ 유ì˜ìˆ˜ì¤€ a=0.05ì—ì„œ ìƒê´€ê³„ìˆ˜ì˜ ê°’ì€ ìœ ì˜ë¯¸í•˜ë‹¤. ë”°ë¼ì„œ ì§‘ì´ ì§€ì–´ 진 ì—°ë„와 price와 ìƒê´€ê´€ê³„ê°€ ê±°ì˜ ì—†ë‹¤ê³  ë³¼ 수 있다. ii. renovated_TF : renovateì˜ ì—¬ë¶€ - 분ì„방법 : 양측 가설검정 (t-test) # renovate ëœ group ì˜ ì§‘ê°’ì´ renovate ì•ˆëœ group ì˜ ì§‘ê°’ë³´ë‹¤ 높다 renovate_house <- kc_house[kc_house$yr_renovated !=0, "price"] # renovate ëœ ì§‘ë“¤ì˜ê°€ê²© not_renovate_house <- kc_house[kc_house$yr_renovated ==0, "price"] # renovate ì•ˆëœ ì§‘ë“¤ì˜ê°€ê²© t.test(renovate_house[[1]], not_renovate_house[[1]]) # p-value < 2.2e-16 ì´ ë¯€ë¡œ ë‘ ê·¸ë£¹ì˜ í‰ê· ì€ ê°™ 지 않다. (H0 기 ê°)
  • 19. # mean of x mean of y # 760379.0 530360.8 Welch Two Sample t-test ê²°ê³¼ > data: renovate_house[[1]] and not_renovate_house[[1]] t = 11.36, df = 939.86, p-value < 2.2e-16 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: 190280.9 269755.5 sample estimates: mean of x mean of y 760379.0 530360.8 p-valueê°’ì´ 0.000ì´ë¯€ë¡œ t.test ê²°ê³¼, renovateëœ ì§‘ê³¼ renovateê°€ ë˜ì§€ ì•Šì€ ì§‘ 사ì´ì˜ í‰ê·  ê°’ì—는 ì°¨ì´ê°€ 나타났다. 하지만 회귀검정 과정ì—ì„œ ì´ ë³€ìˆ˜ë¥¼ ì œì™¸í•˜ì˜€ì„ ë•Œì˜ R-squareê°’ì´ ì´ ë³€ìˆ˜ë¥¼ ì¶”ê°€í•˜ì˜€ì„ ë•Œ ì˜ R-square값보다 높았기 ë•Œë¬¸ì— íšŒê·€ëª¨í˜• 변수로 채íƒí•˜ì§€ 않았다. iii. school_to : zipcode group 별 í•™êµìˆ˜ kc_house <- readxl::read_excel(path = "kc_house_data.xlsx", sheet = 1, col_names = TRUE) zipcode_school <- readxl::read_excel(path = "zipcode_school.xlsx", sheet = 1, col_names = TRUE) str(zipcode_school) head(zipcode_school) kc_house_DT <- as.data.table(kc_house) # ì›ë°ì´í„° kc_house 와 zipcode_school ë°ì´í„°ë¥¼ join í•´ì„œ kc_house ì— ë„£ì–´ì¤Œ kc_house <- merge(kc_house, zipcode_school, by = "zipcode", all = TRUE) View(kc_house) # í•™êµìˆ˜ 내림차순으로 zipcode ì •ë ¬ x <- kc_house[order(kc_house$school_to, decreasing = TRUE) , c("zipcode","school_to")] unique(x) # 중복값 제거 # price 내림차순으로 zipcode ì •ë ¬ kc_house[order(kc_house$price, decreasing = TRUE) , c("price","zipcode")]
  • 20. cor(zipcode_price_group[,2][[1]], zipcode_price_group[,7][[1]]) # el - price # -0.2 cor(zipcode_price_group[,3][[1]], zipcode_price_group[,7][[1]]) # mi - price # -0.2 cor(zipcode_price_group[,4][[1]], zipcode_price_group[,7][[1]]) # hi - price # -0.3 cor(zipcode_price_group[,5][[1]], zipcode_price_group[,7][[1]]) # to - price # -0.3 cor.test(zipcode_price_group[,2][[1]], zipcode_price_group[,7][[1]]) # el - price # -0.2 cor.test(zipcode_price_group[,3][[1]], zipcode_price_group[,7][[1]]) # mi - price # -0.2 cor.test(zipcode_price_group[,4][[1]], zipcode_price_group[,7][[1]]) # hi - price # -0.3 cor.test(zipcode_price_group[,5][[1]], zipcode_price_group[,7][[1]]) # to - price # -0.3 í•™êµ ìˆ˜ê°€ 많ì„ìˆ˜ë¡ ì§‘ê°’ì´ ë‹¤ì†Œ 하ë½í•˜ëŠ” ê²½í–¥ì„ ë³´ì¸ë‹¤. 하지만 ìƒê´€ê´€ê³„ê°€ 0.2~0.3 으로 낮게 나왔기 ë•Œë¬¸ì— ì´ˆë“±í•™êµ, 중학êµ, 고등학êµ, ì „ì²´ í•™êµì˜ 숫ìžëŠ” price ì— í° ì˜í–¥ì„ 미치지 않는다고 ë³¼ 수 있다. iv. season_price : ì§‘ì´ íŒ”ë¦° 계절별(ë´„/여름/ê°€ì„/겨울) - 분ì„방법 : 여러집단 사ì´ì˜ ì°¨ì´ê²€ì • (ANOVA, scheffe, test) head(date_numeric) # 새로운변수ìƒì„± kc_house$date_numeric = date_numeric # 집ì´íŒ”린날짜가 # ë´„(3,4,5 ì›”) / 여름(6,7,8 ì›”) / ê°€ì„(9,10,11 ì›”) / 겨울(12,1,2 ì›”) 별로 price 그룹화 spr_price <- kc_house[grep("....03..|....04..|....05..", date_numeric), "price"] sum_price <- kc_house[grep("....06..|....07..|....08..", date_numeric), "price"] fal_price <- kc_house[grep("....09..|....10..|....11..", date_numeric), "price"] win_price <- kc_house[grep("....12..|....01..|....02..", date_numeric), "price"] mean(spr_price[[1]]) mean(sum_price[[1]]) mean(fal_price[[1]]) mean(win_price[[1]]) length(spr_price[[1]]) + length(sum_price[[1]]) + length(fal_price[[1]]) + length(win_price[[1]]) # 21613 "전체를다가져왔는지확ì¸" OK seson_price <- c(spr_price[[1]], sum_price[[1]], fal_price[[1]], win_price[[1]]) group <- c(rep(1, length(spr_price[[1]])), rep(2, length(sum_price[[1]])), rep(3, length(fal_price[[1]])), rep(4, length(win_price[[1]])))
  • 21. length(group) # 21613 cbind(seson_price, group) boxplot(seson_price ~ group) describe.by(seson_price, group) # 그룹별기술통계량계산 ANO_R<-aov(seson_price ~ group) anova(ANO_R) scheffe.test(ANO_R, "group", alpha =0.05, console = TRUE) LSD.test(ANO_R, "group", alpha =0.05, console = TRUE) duncan.test(ANO_R, "group", alpha =0.05, console = TRUE) t.test(fal_price[[1]], win_price[[1]]) ê²°ê³¼ > head( cbind(seson_price, group) ) seson_price group [1,] 538000 1 [2,] 180000 1 [3,] 310000 1 [4,] 530000 1 [5,] 650000 1 [6,] 485000 1 > tail( cbind(seson_price, group) ) seson_price group [21608,] 330000 4 [21609,] 230000 4 [21610,] 645000 4 [21611,] 414500 4 [21612,] 347500 4 [21613,] 350000 4
  • 22. > describe.by(seson_price, group) # 그룹별 기술통계량 계산 Descriptive statistics by group group: 1 vars n mean sd median trimmed mad min max range skew kurtosis se X1 1 6520 543036.7 363293.6 455700 486463.7 228765.2 78000 7062500 6984500 4 34.18 4499.19 ------------------------------------------------------------------------------ group: 2 vars n mean sd median trimmed mad min max range skew kurtosis se X1 1 6331 543183.9 377206.2 450000 480799 222390 75000 5570000 5495000 3.72 24.97 4740.7 ------------------------------------------------------------------------------ group: 3 vars n mean sd median trimmed mad min max range skew kurtosis se
  • 23. X1 1 5063 536213 365608 450000 480103.4 222390 82500 7700000 7617500 5.02 60.6 5138.21 ------------------------------------------------------------------------------ group: 4 vars n mean sd median trimmed mad min max range skew kurtosis se X1 1 3699 534896.4 358372.5 447500 477146.5 218683.5 83000 3800000 3717000 3.22 16.54 5892.4 > anova(ANO_R) Analysis of Variance Table Response: seson_price Df Sum Sq Mean Sq F value Pr(>F) group 1 2.4117e+11 2.4117e+11 1.7894 0.181 Residuals 21611 2.9127e+15 1.3478e+11 > scheffe.test(ANO_R, "group", alpha = 0.05, console = TRUE) Study: ANO_R ~ "group" Scheffe Test for seson_price Mean Square Error : 134777455496 group, means seson_price std r Min Max 1 543036.7 363293.6 6520 78000 7062500 2 543183.9 377206.2 6331 75000 5570000 3 536213.0 365608.0 5063 82500 7700000 4 534896.4 358372.4 3699 83000 3800000 Alpha: 0.05 ; DF Error: 21611 Critical Value of F: 2.60532 Harmonic Mean of Cell Sizes 5133.59
  • 24. Minimum Significant Difference: 20258.36 Means with the same letter are not significantly different. Groups, Treatments and means a 2 543200 a 1 543000 a 3 536200 a 4 534900 계절(ë´„/여름/ê°€ì„/겨울)ì— ë”°ë¥¸ ê·¸ë£¹ì˜ í‰ê· ì—ì„œ 유ì˜ë¯¸í•œ ì°¨ì´ê°€ 나타나지 않았다. ë”°ë¼ì„œ, 계 ì ˆì— ë”°ë¥¸ priceì—는 ì°¨ì´ê°€ 없다고 ë³¼ 수 있다. 3. ìžë£Œ ë¶„ì„ A. ë¶„ì„ ë°©ë²• i. 회귀 모형 ii. íšŒê·€ëª¨í˜•ì— ë“¤ì–´ê°ˆ ë°ì´í„° 전처리 # date 변수를 numeric 으로 바꾸기 date_numeric <- substr(kc_house_data$date, 1, 8) # 20141124 형태로 ì—°ë„날짜 문ìžë¶€ë¶„만 추출 date_numeric <- as.numeric(date_numeric) is.numeric(date_numeric) # TRUE kc_house_data$date <- date_numeric # ê¸°ì¡´ì˜ date 를 새로운 numeric date ë¡œ 대체 집값 ~ í¬ê¸°ë¥¼ 나타내는 변수(bathsrooms, bedrooms, floors, sqft_living) + 해안가 여부(waterfront _ T/F) + ì§‘ì„ ë³´ëŸ¬ì˜¨ 횟수(view _ 0/1/2/3/4) + 집 í‰ê°€ì²™ë„(grade) + zipcode group(ì´ 12ê°œ)ì— ëŒ€í•œ ë”미변수 house_lm = lm(kc_house_data$price ~ room_newnum + waterfront + view + grade + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + X12, data = kc_house_data)
  • 25. # í¬ê¸°ë¥¼ 나타내는 ë³€ìˆ˜ë“¤ì„ ë¬¶ì–´ì£¼ê¸° kc_house_data$room_newnum = kc_house$bedrooms*0.308 + kc_house$bathrooms*0.525 + kc_house$floors*0.257 + kc_house$sqft_living*0.702 cor(kc_house$room_newnum, kc_house$price) # 0.702 View(kc_house_data) iii. zipcode 별로 priceì— ëŒ€í•œ 정규성 검정 # by(A, B, shapiro.test) # Bì— ìžˆëŠ” 모든 ì§‘ë‹¨ì˜ Aê°’ì— ëŒ€í•´ 정규성 ê²€ì •ì„ í•œë‹¤ A <- by(kc_house$price, kc_house$zipcode, shapiro.test) ## p-value > 0.05ì¸ zipcode group ### # kc_house$zipcode: 98002 # # Shapiro-Wilk normality test # # data: dd[x, ] # W = 0.99639, p-value = 0.9243 ### # kc_house$zipcode: 98108 # # Shapiro-Wilk normality test # # data: dd[x, ] # W = 0.99176, p-value = 0.3707 ### B. ë¶„ì„ ê²°ê³¼ i. Stepwiseì— ì˜í•œ ì˜í–¥ë ¥ì´ 있는 ë³€ìˆ˜ì˜ ì„ ë³„ . Call: lm(formula = kc_house_data$price ~ room_newnum + waterfront + view + grade + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + X12, data = kc_house_data)
  • 26. Residuals: Min 1Q Median 3Q Max -1168280 -111183 -11201 93153 4695648 (2) Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -428984.61 11000.94 -38.995 < 2e-16 *** room_newnum 219.91 3.37 65.248 < 2e-16 *** waterfront 596633.57 17493.97 34.105 < 2e-16 *** view 73086.65 2054.59 35.572 < 2e-16 *** grade 78201.44 1841.75 42.460 < 2e-16 *** X1 1210393.45 28972.94 41.777 < 2e-16 *** X2 631557.38 11627.49 54.316 < 2e-16 *** X3 359493.57 12361.11 29.083 < 2e-16 *** X4 467049.42 12554.92 37.200 < 2e-16 *** X5 335753.53 19958.60 16.822 < 2e-16 *** X6 187139.64 12507.13 14.963 < 2e-16 *** X7 118286.47 11234.48 10.529 < 2e-16 *** X8 64278.26 11002.59 5.842 5.23e-09 *** X9 85789.09 10854.34 7.904 2.84e-15 *** X10 -32043.68 12036.78 -2.662 0.00777 ** X11 -103019.44 10948.51 -9.409 < 2e-16 *** X12 -152845.27 13182.95 -11.594 < 2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 203600 on 21596 degrees of freedom Multiple R-squared: 0.6926, Adjusted R-squared: 0.6924 (1) F-statistic: 3041 on 16 and 21596 DF, p-value: < 2.2e-16 ii. íšŒê·€ë¶„ì„ ê²°ê³¼ ë¶„ì„ 1단계 : íšŒê·€ëª¨í˜•ì€ í†µê³„ì ìœ¼ë¡œ 타당한가? 귀무가설 : íšŒê·€ëª¨í˜•ì€ íƒ€ë‹¹í•˜ì§€ 않다. 대립가설 : íšŒê·€ëª¨í˜•ì€ íƒ€ë‹¹í•˜ë‹¤.
  • 27. (1) F-statistic: 3041 on 16 and 21596 DF, p-value: < 2.2e-16 (1)ì˜ ì¶œë ¥ ê²°ê³¼ë¬¼ì„ ë³´ë©´ p-value ê°’ì´ 0.000 ì´ë¯€ë¡œ ê·€ë¬´ê°€ì„¤ì„ ê¸°ê°í•œë‹¤. 1ë‹¨ê³„ì˜ ê²°ë¡  : 대립가설, íšŒê·€ëª¨í˜•ì€ íƒ€ë‹¹í•˜ë‹¤ 2단계 : ë…립변수 ê°ê°ì€ 종ì†ë³€ìˆ˜ì—게 ì˜í–¥ì„ 주는가? 귀무가설 : ë…립변수는 종ì†ë³€ìˆ˜ì—게 ì˜í–¥ì„ 주지 않는다. 대립가설 : ë…립변수는 종ì†ë³€ìˆ˜ì—게 ì˜í–¥ì„ 준다. (2) Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -428984.61 11000.94 -38.995 < 2e-16 *** room_newnum 219.91 3.37 65.248 < 2e-16 *** waterfront 596633.57 17493.97 34.105 < 2e-16 *** view 73086.65 2054.59 35.572 < 2e-16 *** grade 78201.44 1841.75 42.460 < 2e-16 *** X1 1210393.45 28972.94 41.777 < 2e-16 *** X2 631557.38 11627.49 54.316 < 2e-16 *** X3 359493.57 12361.11 29.083 < 2e-16 *** X4 467049.42 12554.92 37.200 < 2e-16 *** X5 335753.53 19958.60 16.822 < 2e-16 *** X6 187139.64 12507.13 14.963 < 2e-16 *** X7 118286.47 11234.48 10.529 < 2e-16 *** X8 64278.26 11002.59 5.842 5.23e-09 *** X9 85789.09 10854.34 7.904 2.84e-15 *** X10 -32043.68 12036.78 -2.662 0.00777 ** X11 -103019.44 10948.51 -9.409 < 2e-16 *** X12 -152845.27 13182.95 -11.594 < 2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (2)ì˜ ì¶œë ¥ ê²°ê³¼ë¬¼ì„ ë³´ë©´ 유ì˜í™•ë¥ (Pr(>|t|))ì´ 0.000 ì— ê°€ê¹Œìš°ë¯€ë¡œ ê·€ë¬´ê°€ì„¤ì„ ê¸°ê°í•˜ì—¬, 유ì˜ìˆ˜ì¤€ì¸ 0.05 구간ì—ì„œ ë…립변수는 종ì†ë³€ìˆ˜ì—게 ì˜í–¥ì„ 준다는 ëŒ€ë¦½ê°€ì„¤ì„ ì±„íƒí•œë‹¤. 2ë‹¨ê³„ì˜ ê²°ë¡  : ë…립변수는 종ì†ë³€ìˆ˜ì—게 ì˜í–¥ì„ 준다. ë³€ìˆ˜ì„ íƒ : 단계 ì„ íƒë²•(Stepwise Selection)방법 ì„ íƒ step(회귀분ì„결과물, direction = c("forward", "backward", "both")) model.stepwise = step(house_lm, direction = "both") summary(model.stepwise)
  • 28. 3단계 : ë³€ìˆ˜ì„ íƒ (결론출력) ë…립변수ì˜íšŒê·€ê³„수(coefficient of Regression) : 0.6926 4단계 : 회귀모형ì˜ì„¤ëª…ë ¥ = ë…립변수ì˜ì„¤ëª… Multiple R-squared : 0.6926 0.6926 * 100 = 69.2% ì„¤ëª…ê³„ìˆ˜ì¸ R-squared 를 출력한 결과가 price ì˜ ë‹¤ë¦„ì„ ì•½ 69.2%를 설명한다. 단, 고려해야할 ì ì´ 있다. 만약, ìµœì¢…íšŒê·€ëª¨í˜•ì— ë…립변수가 2ê°œì´ìƒ í¬í•¨ì´ ë˜ë©´ 첫째, íšŒê·€ê³„ìˆ˜ì˜ í•´ì„ì„ í™•ì¸í•´ì•¼ 한다. ë…립변수1ì€ ë‚˜ë¨¸ì§€ ë…ë¦½ë³€ìˆ˜ë“¤ì´ ê³ ì •ë˜ì–´ ìžˆì„ ë•Œì—(통제) 1ì˜ê¸°ë³¸ë‹¨ìœ„ê°€ 1 ì¦ê°€í•˜ë©´ 종ì†ë³€ìˆ˜ëŠ” 약 얼마 ì¦ê°€/ê°ì†Œí•˜ëŠ”지 고려해야 한다. 둘째, 다중공선성(Multicollinearity)ì„ í™•ì¸í•´ì•¼ 한다. ë…ë¦½ë³€ìˆ˜ë“¤ê°„ì˜ ì„ í˜•ì˜ ê´€ê³„ëŠ” 없어야 한다. 예를 들어, VIF(Varaince Inflation Factor) : 10 ì´ìƒì´ë©´ ë‹¤ì¤‘ê³µì„ ì„±ì´ ì¡´ìž¬í•œë‹¤ê³  íŒë‹¨í•˜ë©°, ë…립변수들 ê°„ì— ì„ í˜•ì˜ ê´€ê³„ê°€ ì¡´ìž¬í•¨ì„ ì•Œ 수 있다. 만약 ì´ëŸ¬í•œ 결과가 나온다면 ë…립변수들 ì¤‘ì— ë¹¼ëŠ” ê²ƒì„ ê²€í† í•œë‹¤. 다중공선성(Multicollinearity)ì„ í™•ì¸ library(car) car::vif(model.stepwise) > car::vif(model.stepwise) room_newnum waterfront view grade X1 X2 X3 X4 2.465253 1.194046 1.292139 2.442943 1.009935 1.018493 1.025663 1.009917 X5 X6 X7 X8 X9 X10 X11 X12 1.003873 1.005920 1.012795 1.005337 1.003162 1.003162 1.003869 1.002985 모든 ë³€ìˆ˜ë“¤ì— ëŒ€í•œ VIF ê°’ì´ 10 ì´í•˜ì´ë¯€ë¡œ ë…립변수 ê°„ì˜ ë‹¤ì¤‘ê³µì„ ì„±ì€ ì¡´ìž¬í•˜ì§€ 않는다. íšŒê·€ëª¨í˜•ì˜ ì„¤ëª…ë ¥ : Adjusted R-Square Adjusted R-squared: 0.6924 ë…ë¦½ë³€ìˆ˜ë“¤ì˜ ì˜í–¥ë ¥ í¬ê¸° ë¹„êµ library(lm.beta) lm.beta::lm.beta(attitude.lm) > lm.beta::lm.beta(model.stepwise)
  • 29. Call: lm(formula = kc_house_data$price ~ room_newnum + waterfront + view + grade + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + X12, data = kc_house_data) Standardized Coefficients:: (Intercept) room_newnum waterfront view grade X1 X2 X3 0.00000000 0.38651375 0.14060267 0.15255635 0.25038342 0.15839614 0.20680922 0.11112192 X4 X5 X6 X7 X8 X9 X10 X11 0.14104423 0.06359071 0.05661791 0.03997667 0.02209983 0.02986607 -0.01005961 - 0.03556859 X12 -0.04380771 4. 예측 kc_house_final <- kc_house_data[,c("room_newnum","waterfront","view","grade","X1","X2","X3","X4","X5","X6","X7","X 8","X9","X10","X11","X12")] # predict(회귀분ì„ê²°ê³¼, newdata = data.frame(complaints = )) predict(model.stepwise, newdata = data.frame(kc_house_final[1:5,]), interval = "predict") View(kc_house_data) 1) 예측값 > predict(model.stepwise, newdata = data.frame(kc_house_final[1:6,]), interval = "predict") # ì ì¶” ì •? fit lwr upr 1 219839.6 -186103.7 625782.8 2 285515.2 -120405.3 691435.7 3 268629.0 -137296.2 674554.2 4 493984.5 87995.8 899973.1
  • 30. 2) 실제값 zipcode id date price 1 98001 302000375 20140814 169100 2 98001 6181400920 20150430 286651 3 98001 2005950050 20140527 260000 4 98001 8956200070 20140905 447500 5. 최종 ê²°ë¡  í‘œì¤€í™”ëœ íšŒê·€ê³„ìˆ˜ì— ì˜í•´ì„œ, priceì— ê°€ìž¥ ì˜í–¥ì„ 미치는 변수는 ì§‘ì˜ í¬ê¸°ì™€ ê´€ë ¨ëœ ë³€ 수들(화장실수, 침실수, 층수, í‰ìˆ˜)와 gradeë¼ëŠ” ê²ƒì„ ì•Œ 수 있다. 변수명 변수 설명 í‘œì¤€í™”ëœ ìƒê´€ê³„수 room_newnum ì§‘ì˜ í¬ê¸°ì™€ ê´€ë ¨ëœ ë³€ìˆ˜ë“¤ì„ í•œê°€ì§€ ì˜ ë³€ìˆ˜ë¡œ 모ë¸ë§ í•œ 변수 (화장실수, 침실수, 층수, í‰ìˆ˜) 0.387 grade Kingcounty grading systemì— ì˜í•œ ì§‘ì— ëŒ€í•œ í‰ê°€ì²™ë„ 0.250 6. ë¯¸ë¹„ì  ë° ê°œì„ ë°©í–¥ - 집 í¬ê¸°ì— ì˜í–¥ì„ 받는 변수들(bathrooms, bedrooms, floors, sqft_living)ì„ í•˜ë‚˜ì˜ ë°ì´í„°ë¡œ ëª¨ë¸ ë§í•˜ëŠ” 과정ì—ì„œ 가중치 ì„ ì •ë°©ë²•ì´ ë¯¸í¡í–ˆë˜ 것 같다. ì´ëŸ¬í•œ ë³€ìˆ˜ë“¤ì´ ì‹¤ì œë¡œ ì§‘ê°’ì— ì˜í–¥ì„ 미치는 ì •ë„ì— ë”°ë¼ì„œ 가중치를 선정하려고 했으나, 논문ìžë£Œì—ì„œ ì´ì— 대한 정보를 찾지 못했다. 그래서 차선책으로 ê° ë³€ìˆ˜ë“¤ì— ëŒ€í•œ ìƒê´€ê³„수를 곱해서 ë”해주는 ë°©ì‹ì„ ì„ íƒí–ˆë‹¤. ì´ ë°©ë²•ì€ í˜„ìž¬ 주어진 ìžë£Œì—서만 유효하므로, 다ìŒì— 기회가 ëœë‹¤ë©´ ì „ì²´ ì§‘ê°’ì— ìœ ì˜ë¯¸í•œ 모ë¸ë§ì„ í•´ë³´ ê³  싶다. - 집 별로 í‰ìˆ˜ì˜ ì°¨ì´ê°€ ìžˆëŠ”ë° ë‹¨ìœ„ í‰ìˆ˜ë‹¹ ê°€ê²©ì„ ê³ ë ¤í•˜ì§€ ì•Šê³  단순히 zipcode별 ì§‘ê°’ì— ëŒ€ í•œ 분ì„ì„ ì§„í–‰í•˜ì˜€ë‹¤. ë”°ë¼ì„œ í‰ìˆ˜ë‹¹ 가격보다는 ì§€ì—­ì˜ ìš”ì¸ì— 중ì ì„ ë‘었으며, 누ë½ë˜ëŠ” 부분 ì´ ì¶©ë¶„ížˆ ìžˆì—ˆì„ ê²ƒì´ë¼ê³  íŒë‹¨ëœë‹¤.
  • 31. - 주어진 ë°ì´í„° ì´ì™¸ì— 새로운 ë³€ìˆ˜ë“¤ì„ ë§Œë“¤ì–´ë‚´ë ¤ê³  노력하였는ë°, ë§Žì€ ë³€ìˆ˜ë“¤(School, Seasonality 등)ì´ priceì— ìœ ì˜ë¯¸í•œ ì˜í–¥ì„ 주지 ì•Šì•„ 채íƒí•˜ì§€ 못했다. 변수를 세부그룹으로 나 누거나 ì´ìƒì¹˜ë¥¼ 제거하는 등 ë°ì´í„° 전처리 ê³¼ì •ì„ ê±°ì¹œë‹¤ë©´ 채íƒí•˜ì§€ 못한 변수들과 price 사 ì´ì— 유ì˜ë¯¸í•œ ìƒê´€ê´€ê³„를 ì°¾ì„ ìˆ˜ ìžˆì„ ê²ƒì´ë¼ê³  ìƒê°í•œë‹¤. ì „ì²´ì ìœ¼ë¡œ ë°ì´í„°ë¥¼ 전처리 하는 ê³¼ì •ì´ ë¶€ì¡±í•˜ì˜€ë˜ ì ì´ 아쉬웠다. - 관련 ë…¼ë¬¸ì„ ì°¾ì•„ë³´ë‹¤ê°€ ì§‘ê°’ì´ ë¹„ì‹¸ì§ˆìˆ˜ë¡ í™”ìž¥ì‹¤ìˆ˜, í‰ìˆ˜ 등과 ê°™ì€ ë¬¼ë¦¬ì ì¸ 특성보다 해안 ê°€, í•™êµ ìˆ˜, 백화ì ê³¼ì˜ 거리 ë“±ì˜ ìž…ì§€ì ì¸ íŠ¹ì„±ì— ë”ìš± ì˜í–¥ì„ ë§Žì´ ë°›ëŠ”ë‹¤ëŠ” 연구결과를 알게 ë˜ì—ˆë‹¤. ì§‘ê°’ì´ ë¹„ì‹¼ 집들과 비싸지 ì•Šì€ ì§‘ë“¤ì„ ê°ê° 분ì„í•´ì„œ 여러 ê°€ì§€ì˜ íšŒê·€ëª¨í˜•ì„ ì„ ì •í•´ 보는 ê²ƒë„ ì¢‹ì€ í”„ë¡œì íŠ¸ê°€ ë˜ì—ˆì„ 것 같다. 종ì†ë³€ìˆ˜ì— 대한 시계열 ìžë£Œ house_price_ts <- ts(kc_house$price, start = c(2014, 182), end = c(2015, 182), frequency = 365) plot(house_price_ts, ylab = "Kingcounty House Price", xlab = "Year.Month", xlim = c(2014.5, 2015.5)) par(mfrow = c(1,2)) acf(house_price_ts) # 파란색 ì ì„  ë°‘ì— ê·¸ëž˜í”„ê°€ 있어야 유ì˜í•œê²ƒì´ë‹¤! pacf(house_price_ts) ndiffs(house_price_ts) # 0 # 아주 강력한 function! auto.arima houseBest <- auto.arima(x = house_price_ts) houseBest par(mfrow = c(1,2)) library(scales) forecast(houseBest, h = 5) -> houseforecast # 80%, 95% ì‹ ë¢°êµ¬ê°„ì´ ê°™ì´ ë‚˜ì˜´ plot(houseforecast)