HT 2020

Birthweight Dataset

Mullahy (1997; ReStat)

# Load packages for robust standard errors (install them first!)
library(sandwich)
library(lmtest)

# Load the data from the wooldridge package (install it first!)
library(wooldridge)

# View the names of the columns
names(bwght)
##  [1] "faminc"   "cigtax"   "cigprice" "bwght"    "fatheduc" "motheduc"
##  [7] "parity"   "male"     "white"    "cigs"     "lbwght"   "bwghtlbs"
## [13] "packs"    "lfaminc"

Description of relevant variables: bwght

  • motheduc mother’s years of education
  • white equals one if white
  • faminc 1988 family income in $1000s
  • lfaminc log of faminc
  • cigs number of cigarettes smoked per day while pregnant
  • packs number of packes of smoked per day while pregnant

Part (a) - Define the outcome variable smokes

# Check that cigs and packs agree about who smoked during pregnancy
any(bwght$cigs == 0 & bwght$packs > 0)
## [1] FALSE
any(bwght$packs == 0 & bwght$cigs > 0)
## [1] FALSE
# Since they agree, we can use either to define smokes
bwght$smokes <- ifelse(bwght$cigs > 0, yes = 1, no = 0)

Part (a) - Fit a Probit Regression

smoking_model <- smokes ~ motheduc + white + lfaminc

probit <- glm(smoking_model, family = binomial(link = 'probit'), data = bwght)
coeftest(probit)
## 
## z test of coefficients:
## 
##              Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)  1.126273   0.250374  4.4984 6.848e-06 ***
## motheduc    -0.145060   0.020697 -7.0087 2.405e-12 ***
## white        0.189677   0.110680  1.7137 0.0865758 .  
## lfaminc     -0.166911   0.049842 -3.3488 0.0008115 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Part (b) - Predictions for Alice and Beth

mean_faminc <- mean(bwght$faminc)
Alice <- c(motheduc = 12, white = 1, lfaminc = log(mean_faminc))
Beth <- c(motheduc = 16, white = 1, lfaminc = log(mean_faminc))
predict_me <- data.frame(rbind(Alice, Beth))
predictions <- predict(probit, newdata = predict_me, type = 'response')
predictions
##      Alice       Beth 
## 0.16183185 0.05853452
diff(predictions)
##       Beth 
## -0.1032973

Part (c) - Average partial effect of lfaminc

# Average of g(x'beta_hat) where g is the std. normal density: dnorm
# (predict defaults to the scale of x'beta_hat)
mean(dnorm(predict(probit))) * coef(probit)[4]
##     lfaminc 
## -0.03614676

Part (d) - Pseudo R-squared

# Fit model with only an intercept
model0 <- smokes ~ 1
probit0 <- glm(model0, family = binomial(link = 'probit'), data = bwght)

# Pseudo R-squared 
1 - logLik(probit) / logLik(probit0)
## 'log Lik.' 0.07838101 (df=4)