install.packages("foreign") # balik pro nahravani ruzných typu souboru STATA atd.
install.packages("nlme")
install.packages("lmtest")
library(foreign)
getwd()
## [1] "C:/Users/Lukas/Desktop/Ekonometrie/AKM/cviceni 2017/cviceni 7"
nemovitost=read.dta("C:/Users/Lukas/Documents/KIELMC.dta")
## Warning in read.dta("C:/Users/Lukas/Documents/KIELMC.dta"): cannot read
## factor labels from Stata 5 files
attach(nemovitost)
head(nemovitost)
## year age agesq nbh cbd intst lintst price rooms area land baths dist
## 1 1978 48 2304 4 3000 1000 6.9078 60000 7 1660 4578 1 10700
## 2 1978 83 6889 4 4000 1000 6.9078 40000 6 2612 8370 2 11000
## 3 1978 58 3364 4 4000 1000 6.9078 34000 6 1144 5000 1 11500
## 4 1978 11 121 4 4000 1000 6.9078 63900 5 1136 10000 1 11900
## 5 1978 48 2304 4 4000 2000 7.6009 44000 5 1868 10000 1 12100
## 6 1978 78 6084 4 3000 2000 7.6009 46000 6 1780 9500 3 10000
## ldist wind lprice y81 larea lland y81ldist lintstsq nearinc
## 1 9.277999 3 11.00210 0 7.414573 8.429017 0 47.71770 1
## 2 9.305651 3 10.59663 0 7.867871 9.032409 0 47.71770 1
## 3 9.350102 3 10.43412 0 7.042286 8.517193 0 47.71770 1
## 4 9.384294 3 11.06507 0 7.035269 9.210340 0 47.71770 1
## 5 9.400961 3 10.69195 0 7.532624 9.210340 0 57.77368 1
## 6 9.210340 3 10.73640 0 7.484369 9.159047 0 57.77368 1
## y81nrinc rprice lrprice
## 1 0 60000 11.00210
## 2 0 40000 10.59663
## 3 0 34000 10.43412
## 4 0 63900 11.06507
## 5 0 44000 10.69195
## 6 0 46000 10.73640
V prípade kdy byste neresili casovou dimenzi dat, udelali byste tzv. pool regresi.
\(price=\alpha_0 + \alpha_1 nearinc +\epsilon\)
regrese1=lm(rprice~nearinc)
summary(regrese1)
##
## Call:
## lm(formula = rprice ~ nearinc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65035 -18579 -5579 13965 233421
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 91036 2081 43.752 < 2e-16 ***
## nearinc -24457 3805 -6.428 4.72e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31210 on 319 degrees of freedom
## Multiple R-squared: 0.1147, Adjusted R-squared: 0.1119
## F-statistic: 41.32 on 1 and 319 DF, p-value: 4.715e-10
V tomto prípade vidíte, ze oblast blízko spalovny ovlivnuje cenu nemovitostí. (predpokládáme splnení GM) Parametr \(\alpha_0\) predstavuje prumernou cenu nemovitosti, která není v blízkosti místa spalovny.
Jaká byla cena nemovitostí v dané oblasti pred plánovanou výstavbou spalovny?
Cena nemovitosté v roce 1978:
Nejprve si vytvoríme dummy promennou.
dummy_rok=as.numeric(nemovitost$year==1981)
Následne vybereme pouze ty data, která odpovídají roku 1978.
price78=rprice[which(dummy_rok == 0)]
nearinc78=nearinc[which(dummy_rok == 0)]
A provedeme regresi:
regrese78=lm(price78~nearinc78)
summary(regrese78)
##
## Call:
## lm(formula = price78 ~ nearinc78)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56517 -16605 -3193 8683 236307
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82517 2654 31.094 < 2e-16 ***
## nearinc78 -18824 4745 -3.968 0.000105 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29430 on 177 degrees of freedom
## Multiple R-squared: 0.08167, Adjusted R-squared: 0.07648
## F-statistic: 15.74 on 1 and 177 DF, p-value: 0.0001054
Cena nemovitosté v roce 1981 po výstavbe spalovny:
price81=rprice[which(dummy_rok == 1)] # príkaz which vybere pouze promenné splnující danou podmínku
nearinc81=nearinc[which(dummy_rok == 1)]
regrese81=lm(price81~nearinc81)
summary(regrese81)
##
## Call:
## lm(formula = price81 ~ nearinc81)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60678 -19832 -2997 21139 136754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101308 3093 32.754 < 2e-16 ***
## nearinc81 -30688 5828 -5.266 5.14e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31240 on 140 degrees of freedom
## Multiple R-squared: 0.1653, Adjusted R-squared: 0.1594
## F-statistic: 27.73 on 1 and 140 DF, p-value: 5.139e-07
Díky výstavbe spalovny doslo ke zmene v cene nemovitostí:
\(\hat{\gamma_1}=(\overline{price}_{81,nr}-\overline{price}_{81,fr})-(\overline{price}_{78,nr}-\overline{price}_{78,fr})\)
\(\hat{\gamma_1}=(101308-30688-101308 )-(82517-18824-82517)=-11 863.9\)
toto je \(\textbf{difference-in-differences estimator}\)
Jak je to se statistickou významností? Je parametr \(\gamma_1\) statisticky významný? Na tuto otázku nám odpoví upravená regresní funkce:
\(price=\beta_0 + \gamma_0 y81 + \beta_1 nearinc+\gamma_1 y81\times nearinc +\epsilon\)
\(\beta_0\) prumerná cena nemovitosti v roce 1978, která nebyla blízko místa “dnesní” spalovny
\(\beta_1\) vliv daného místa na cenu nemovitosti, pred plánovanou stavbou spalovny
\(\beta_0+\gamma_0\) prumerná cena nemovitosti v roce 1981, která neni blízko místa spalovny
\(\gamma_1\) vliv spalovny na cenu nemovitostí
regrese2=lm(rprice~dummy_rok+nearinc+dummy_rok:nearinc)
summary(regrese2)
##
## Call:
## lm(formula = rprice ~ dummy_rok + nearinc + dummy_rok:nearinc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60678 -17693 -3031 12483 236307
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82517 2727 30.260 < 2e-16 ***
## dummy_rok 18790 4050 4.640 5.12e-06 ***
## nearinc -18824 4875 -3.861 0.000137 ***
## dummy_rok:nearinc -11864 7457 -1.591 0.112595
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30240 on 317 degrees of freedom
## Multiple R-squared: 0.1739, Adjusted R-squared: 0.1661
## F-statistic: 22.25 on 3 and 317 DF, p-value: 4.224e-13
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(regrese2)
##
## studentized Breusch-Pagan test
##
## data: regrese2
## BP = 6.1999, df = 3, p-value = 0.1023
Zkusíme jeste sestavit White test.
white.test=lm(regrese2$residuals^2~regrese2$fitted.values+I(regrese2$fitted.values^2))
Doposud jsme predpokládali, ze pomocná regrese není zatízena heteroskedasticitou. Ale schválne si jí otestujeme.
bptest(white.test)
##
## studentized Breusch-Pagan test
##
## data: white.test
## BP = 5.0029, df = 2, p-value = 0.08196
Vidíme, ze je prítomná heteroskedasticita. Co tedy delat? Pouzijeme robustní odhad.
library(sandwich)
## Warning: package 'sandwich' was built under R version 3.2.5
coeftest(white.test)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.9859e+10 8.2846e+09 2.3971 0.01710 *
## regrese2$fitted.values -4.4540e+05 2.0151e+05 -2.2103 0.02779 *
## I(regrese2$fitted.values^2) 2.5466e+00 1.1988e+00 2.1242 0.03442 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Vidíme, ze je prítomna heteroskedasticita. Pro úplnou správnost by bylo dobré udelat robustní test na sdruzenou signifikantnost parametru, ale to si muzete udelat doma.
Tedy výsledek pro nasí puvodní regresi je:
coeftest(regrese2,vcov)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82517.2 2726.9 30.2603 < 2.2e-16 ***
## dummy_rok 18790.3 4050.1 4.6395 5.117e-06 ***
## nearinc -18824.4 4875.3 -3.8612 0.0001368 ***
## dummy_rok:nearinc -11863.9 7456.6 -1.5911 0.1125948
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(regrese78)
##
## Call:
## lm(formula = price78 ~ nearinc78)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56517 -16605 -3193 8683 236307
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82517 2654 31.094 < 2e-16 ***
## nearinc78 -18824 4745 -3.968 0.000105 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29430 on 177 degrees of freedom
## Multiple R-squared: 0.08167, Adjusted R-squared: 0.07648
## F-statistic: 15.74 on 1 and 177 DF, p-value: 0.0001054
summary(regrese81)
##
## Call:
## lm(formula = price81 ~ nearinc81)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60678 -19832 -2997 21139 136754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101308 3093 32.754 < 2e-16 ***
## nearinc81 -30688 5828 -5.266 5.14e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31240 on 140 degrees of freedom
## Multiple R-squared: 0.1653, Adjusted R-squared: 0.1594
## F-statistic: 27.73 on 1 and 140 DF, p-value: 5.139e-07
Co kdyz zahrneme více promenných:
\(price=\beta_0 + \gamma_0 y81 + \beta_1 nearinc+\gamma_1 y81\times nearinc+\beta_2 age+ \beta_3 age^2+\epsilon\)
regrese3=lm(rprice~dummy_rok+nearinc+dummy_rok:nearinc+age+I(age^2))
summary(regrese3)
##
## Call:
## lm(formula = rprice ~ dummy_rok + nearinc + dummy_rok:nearinc +
## age + I(age^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -79349 -14431 -1711 10069 201486
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.912e+04 2.406e+03 37.039 < 2e-16 ***
## dummy_rok 2.132e+04 3.444e+03 6.191 1.86e-09 ***
## nearinc 9.398e+03 4.812e+03 1.953 0.051713 .
## age -1.494e+03 1.319e+02 -11.333 < 2e-16 ***
## I(age^2) 8.691e+00 8.481e-01 10.248 < 2e-16 ***
## dummy_rok:nearinc -2.192e+04 6.360e+03 -3.447 0.000644 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25540 on 315 degrees of freedom
## Multiple R-squared: 0.4144, Adjusted R-squared: 0.4052
## F-statistic: 44.59 on 5 and 315 DF, p-value: < 2.2e-16
bptest(regrese3)
##
## studentized Breusch-Pagan test
##
## data: regrese3
## BP = 19.685, df = 5, p-value = 0.001432
coeftest(regrese3,vcov)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.9117e+04 2.4061e+03 37.0385 < 2.2e-16 ***
## dummy_rok 2.1321e+04 3.4436e+03 6.1914 1.857e-09 ***
## nearinc 9.3979e+03 4.8122e+03 1.9529 0.0517131 .
## age -1.4944e+03 1.3186e+02 -11.3334 < 2.2e-16 ***
## I(age^2) 8.6913e+00 8.4813e-01 10.2476 < 2.2e-16 ***
## dummy_rok:nearinc -2.1920e+04 6.3597e+03 -3.4467 0.0006445 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Cenu domu bude ovlivnovat i jeho velikost:
\(price=\beta_0 + \gamma_0 y81 + \beta_1 nearinc+\gamma_1 y81\times nearinc+\beta_2 age+ \beta_3 age^2+\beta_4 area+\beta_5 rooms+\epsilon\)
regrese4=lm(rprice~dummy_rok+nearinc+dummy_rok:nearinc+age+I(age^2)+area+rooms)
summary(regrese4)
##
## Call:
## lm(formula = rprice ~ dummy_rok + nearinc + dummy_rok:nearinc +
## age + I(age^2) + area + rooms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84912 -9191 -485 7537 166695
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.048e+04 1.067e+04 0.983 0.32653
## dummy_rok 1.350e+04 2.853e+03 4.732 3.37e-06 ***
## nearinc 7.494e+03 3.932e+03 1.906 0.05761 .
## age -8.536e+02 1.180e+02 -7.235 3.61e-12 ***
## I(age^2) 4.065e+00 7.799e-01 5.212 3.40e-07 ***
## area 2.209e+01 2.067e+00 10.686 < 2e-16 ***
## rooms 4.577e+03 1.668e+03 2.744 0.00642 **
## dummy_rok:nearinc -1.527e+04 5.146e+03 -2.967 0.00324 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20520 on 313 degrees of freedom
## Multiple R-squared: 0.6244, Adjusted R-squared: 0.616
## F-statistic: 74.35 on 7 and 313 DF, p-value: < 2.2e-16
bptest(regrese4)
##
## studentized Breusch-Pagan test
##
## data: regrese4
## BP = 39.321, df = 7, p-value = 1.698e-06
coeftest(regrese4,vcov)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10480.4474 10665.3250 0.9827 0.326531
## dummy_rok 13502.2854 2853.3679 4.7321 3.370e-06 ***
## nearinc 7493.6557 3932.2702 1.9057 0.057607 .
## age -853.5833 117.9767 -7.2352 3.605e-12 ***
## I(age^2) 4.0646 0.7799 5.2117 3.403e-07 ***
## area 22.0936 2.0674 10.6865 < 2.2e-16 ***
## rooms 4576.8166 1667.8999 2.7441 0.006419 **
## dummy_rok:nearinc -15269.7168 5146.3125 -2.9671 0.003238 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Co KDYBY promenné pocet pokoju a velikost nevyházely statisticky významné, napríklad v dusledku silné korelace. Vyhodili byste dané promenné? Nejprve bychom museli provést test na sdruzenou významnost.
\(H_0: \beta_4=\beta_5=0\) \(H_1: non H_0\)
regrese_omezena=lm(rprice~dummy_rok+nearinc+dummy_rok:nearinc+age+I(age^2))
waldtest(regrese4,regrese_omezena, vcov = vcovHC) # robustní verze Waldova testu
## Wald test
##
## Model 1: rprice ~ dummy_rok + nearinc + dummy_rok:nearinc + age + I(age^2) +
## area + rooms
## Model 2: rprice ~ dummy_rok + nearinc + dummy_rok:nearinc + age + I(age^2)
## Res.Df Df F Pr(>F)
## 1 313
## 2 315 -2 35.486 1.287e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(regrese3)
##
## Call:
## lm(formula = rprice ~ dummy_rok + nearinc + dummy_rok:nearinc +
## age + I(age^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -79349 -14431 -1711 10069 201486
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.912e+04 2.406e+03 37.039 < 2e-16 ***
## dummy_rok 2.132e+04 3.444e+03 6.191 1.86e-09 ***
## nearinc 9.398e+03 4.812e+03 1.953 0.051713 .
## age -1.494e+03 1.319e+02 -11.333 < 2e-16 ***
## I(age^2) 8.691e+00 8.481e-01 10.248 < 2e-16 ***
## dummy_rok:nearinc -2.192e+04 6.360e+03 -3.447 0.000644 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25540 on 315 degrees of freedom
## Multiple R-squared: 0.4144, Adjusted R-squared: 0.4052
## F-statistic: 44.59 on 5 and 315 DF, p-value: < 2.2e-16
summary(regrese4)
##
## Call:
## lm(formula = rprice ~ dummy_rok + nearinc + dummy_rok:nearinc +
## age + I(age^2) + area + rooms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84912 -9191 -485 7537 166695
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.048e+04 1.067e+04 0.983 0.32653
## dummy_rok 1.350e+04 2.853e+03 4.732 3.37e-06 ***
## nearinc 7.494e+03 3.932e+03 1.906 0.05761 .
## age -8.536e+02 1.180e+02 -7.235 3.61e-12 ***
## I(age^2) 4.065e+00 7.799e-01 5.212 3.40e-07 ***
## area 2.209e+01 2.067e+00 10.686 < 2e-16 ***
## rooms 4.577e+03 1.668e+03 2.744 0.00642 **
## dummy_rok:nearinc -1.527e+04 5.146e+03 -2.967 0.00324 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20520 on 313 degrees of freedom
## Multiple R-squared: 0.6244, Adjusted R-squared: 0.616
## F-statistic: 74.35 on 7 and 313 DF, p-value: < 2.2e-16
Cena je dobrým kandidátem na logaritmickou transformaci. Závislá promenná se rídí rozdelením náhodné slozky. Pokud pracujeme s predpoklady klasického lineárního modelu, tak náhodná slozka má normální rozdelení.
par(mfrow=c(1,2))
hist(nemovitost$rprice)
hist(nemovitost$lrprice)
Logaritmus ceny se více podobá normálnímu rozdelení.
\(ln(price)=\beta_0 + \gamma_0 y81 + \beta_1 nearinc+\gamma_1 y81\times nearinc +\epsilon\)
regrese_logaritmus=lm(log(rprice)~dummy_rok+nearinc+dummy_rok:nearinc)
bptest(regrese_logaritmus)
##
## studentized Breusch-Pagan test
##
## data: regrese_logaritmus
## BP = 9.1654, df = 3, p-value = 0.02717
coeftest(regrese_logaritmus)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.285424 0.030514 369.8386 < 2.2e-16 ***
## dummy_rok 0.193094 0.045321 4.2606 2.691e-05 ***
## nearinc -0.339923 0.054555 -6.2308 1.476e-09 ***
## dummy_rok:nearinc -0.062649 0.083441 -0.7508 0.4533
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1