TT 2020

Count Data: Smoking Example

Mullahy (1997), Review of Economics and Statistics, 79, 596-593

library(wooldridge)
library(lmtest)
library(sandwich)

names(smoke)
##  [1] "educ"     "cigpric"  "white"    "age"      "income"   "cigs"    
##  [7] "restaurn" "lincome"  "agesq"    "lcigpric"

Variable Descriptions: smoke

# Specify x'beta
smoking_model <- cigs ~ lcigpric + lincome + restaurn + white + educ + age + agesq
  • cigs number of cigarettes smoked per day
  • lcigpric log of state cigarette price (cents/pack)
  • lincome log of annual income (US Dollars)
  • restaurn equals 1 if restaurant has smoking restrictions
  • white equals 1 if white
  • educ years of schooling
  • age age in years
  • agesq age squared

OLS with plain-vanilla SEs

ols <- lm(smoking_model, data = smoke)
coeftest(ols)
## 
## t test of coefficients:
## 
##               Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) -2.6824347 24.2207299 -0.1107   0.91184    
## lcigpric    -0.8509044  5.7823214 -0.1472   0.88305    
## lincome      0.8690144  0.7287636  1.1925   0.23344    
## restaurn    -2.8656213  1.1174059 -2.5645   0.01051 *  
## white       -0.5592363  1.4594610 -0.3832   0.70169    
## educ        -0.5017533  0.1671677 -3.0015   0.00277 ** 
## age          0.7745021  0.1605158  4.8251 1.676e-06 ***
## agesq       -0.0090686  0.0017481 -5.1878 2.699e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

OLS with Robust SEs

coeftest(ols, vcov. = vcovHC)
## 
## t test of coefficients:
## 
##               Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) -2.6824347 26.1562565 -0.1026  0.918343    
## lcigpric    -0.8509044  6.1132231 -0.1392  0.889334    
## lincome      0.8690144  0.6035374  1.4399  0.150296    
## restaurn    -2.8656213  1.0215479 -2.8052  0.005151 ** 
## white       -0.5592363  1.3943263 -0.4011  0.688468    
## educ        -0.5017533  0.1632410 -3.0737  0.002186 ** 
## age          0.7745021  0.1393883  5.5564 3.752e-08 ***
## agesq       -0.0090686  0.0014754 -6.1464 1.250e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Poisson Regression

pois_reg <- glm(smoking_model, family = poisson(link = 'log'), data = smoke)
coeftest(pois_reg)
## 
## z test of coefficients:
## 
##                Estimate  Std. Error  z value  Pr(>|z|)    
## (Intercept)  0.39644936  0.61394730   0.6457    0.5184    
## lcigpric    -0.10596071  0.14339006  -0.7390    0.4599    
## lincome      0.10372755  0.02028060   5.1146 3.144e-07 ***
## restaurn    -0.36360594  0.03122216 -11.6458 < 2.2e-16 ***
## white       -0.05520115  0.03741971  -1.4752    0.1402    
## educ        -0.05942253  0.00425626 -13.9612 < 2.2e-16 ***
## age          0.11425708  0.00496904  22.9938 < 2.2e-16 ***
## agesq       -0.00137082  0.00005695 -24.0704 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Quasi-Poisson Regression

quasipois <- glm(smoking_model, family = quasipoisson(link = 'log'), data = smoke)
coeftest(quasipois)
## 
## z test of coefficients:
## 
##                Estimate  Std. Error z value  Pr(>|z|)    
## (Intercept)  0.39644936  2.76738512  0.1433  0.886087    
## lcigpric    -0.10596071  0.64633482 -0.1639  0.869778    
## lincome      0.10372755  0.09141539  1.1347  0.256508    
## restaurn    -0.36360594  0.14073479 -2.5836  0.009777 ** 
## white       -0.05520115  0.16867044 -0.3273  0.743462    
## educ        -0.05942253  0.01918520 -3.0973  0.001953 ** 
## age          0.11425708  0.02239807  5.1012 3.375e-07 ***
## agesq       -0.00137082  0.00025671 -5.3400 9.292e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Overdispersion or Underdispersion?

# Extract the estimate of sigma-squared
summary(quasipois)$dispersion
## [1] 20.31782
# Now do it "by hand" 
yhat <- predict(pois_reg, type = 'response')
uhat <- residuals(pois_reg, type = 'response')
mean(uhat^2 / yhat)
## [1] 20.11488

Robust “Sandwich” SEs for Poisson Regression

coeftest(pois_reg, vcov. = vcovHC)
## 
## z test of coefficients:
## 
##               Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)  0.3964494  3.0227099  0.1312   0.89565    
## lcigpric    -0.1059607  0.6787042 -0.1561   0.87594    
## lincome      0.1037276  0.0844975  1.2276   0.21960    
## restaurn    -0.3636059  0.1417068 -2.5659   0.01029 *  
## white       -0.0552011  0.1662139 -0.3321   0.73981    
## educ        -0.0594225  0.0194336 -3.0577   0.00223 ** 
## age          0.1142571  0.0214898  5.3168 1.056e-07 ***
## agesq       -0.0013708  0.0002476 -5.5365 3.086e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Comparing OLS and Poisson Estimates

ybar <- mean(smoke$cigs)
OLS_est <- coefficients(ols)[-1]
pois_est <- coefficients(pois_reg)[-1]
cbind(OLS = OLS_est, Poisson_APE = ybar * pois_est, Poisson = pois_est)
##                   OLS Poisson_APE      Poisson
## lcigpric -0.850904380 -0.92042698 -0.105960710
## lincome   0.869014392  0.90102862  0.103727546
## restaurn -2.865621339 -3.15846053 -0.363605941
## white    -0.559236320 -0.47950441 -0.055201150
## educ     -0.501753267 -0.51617344 -0.059422535
## age       0.774502141  0.99249336  0.114257081
## agesq    -0.009068603 -0.01190761 -0.001370819

Note the age and agesq entries are not partial effects. Why not?