Exercise 1 Solution

Includes:

Lion noses linear regression
Data generation consistent with model
Linear regression of this first generated dataset
In-class Sampling Distribution Simulation Assignment

Document Preamble

Load libraries

library(knitr)
library(abd)

Settings for Knitr (optional)

# Default figure size (width x height, in inches) for every chunk below
opts_chunk$set(
  fig.width = 8,
  fig.height = 6
)

1. Lion noses linear regression:

Data entry

# Load the lion nose data set and preview its first six rows
data("LionNoses")
head(LionNoses, n = 6)

##   age proportion.black
## 1 1.1             0.21
## 2 1.5             0.14
## 3 1.9             0.11
## 4 2.2             0.13
## 5 2.6             0.12
## 6 3.2             0.13

Fit linear model

# Simple linear regression of age (response) on proportion.black (predictor)
lm.nose <- lm(age ~ proportion.black, data = LionNoses)

Parameters:

The coefficients are stored in the fitted model object lm.nose; the residual standard deviation (sigma) is stored in its summary:

# Estimated intercept and slope (for proportion.black) of the fitted line
coef(lm.nose)

##      (Intercept) proportion.black 
##        0.8790062       10.6471194

summary(lm.nose)$sigma # residual standard deviation (estimate of sigma)

## [1] 1.668764

What else is stored in lm.nose and in its summary? (residuals, fitted values, unscaled variance-covariance matrix, etc.)

# Components stored in the fitted lm object itself
names(lm.nose)

##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

# Components added by summary(), e.g. sigma, r.squared and cov.unscaled
names(summary(lm.nose))

##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled"

2. Data generation consistent with fitted model

## Sample size - use the number of rows so it matches the original data
n <- nrow(LionNoses)

## Predictor - copy of the original proportion-black data, now in a vector
p.black <- LionNoses[["proportion.black"]]

## Parameters, taken from the model fitted to the original data
betas <- coef(lm.nose) # regression coefficients (intercept, slope)
sigma <- summary(lm.nose)[["sigma"]] # residual standard deviation

## Errors and response
# Residual errors are modeled as ~ N(0, sigma), where sigma is a
# standard deviation (the scale rnorm() expects)
epsilon <- rnorm(n = n, mean = 0, sd = sigma)

# Response is modeled as a linear function of the predictor plus the errors
y <- betas[1] + betas[2] * p.black + epsilon

3. Linear regression of this generated dataset

# Fit of model to simulated data:
# regress the simulated response y on the original predictor p.black,
# then print the coefficient table and overall fit statistics
lmfit.generated <- lm(y ~ p.black)
summary(lmfit.generated)

## 
## Call:
## lm(formula = y ~ p.black)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6297 -1.6515  0.3908  1.4158  2.8638 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.3638     0.6228   0.584    0.563    
## p.black      11.2055     1.6527   6.780 1.61e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.827 on 30 degrees of freedom
## Multiple R-squared:  0.6051, Adjusted R-squared:  0.5919 
## F-statistic: 45.97 on 1 and 30 DF,  p-value: 1.613e-07

In-Class Sampling Distribution Simulation Assignment

Exercise 1:

Generate 5000 datasets using the same code
Fit a linear regression model to each dataset, storing the fit as “lm.temp”
Store the estimates of \(\beta_1\)

Hint: if you get stuck, try starting with a small number of simulations (less than 5000) until you get the code right.

# Set up an nsims x 1 matrix to store our estimates of beta_1
nsims <- 5000 # number of simulations
beta.hat <- matrix(NA_real_, nrow = nsims, ncol = 1)

# The mean response is identical in every simulated dataset (same parameters,
# same predictor values), so compute it once outside the loop
mu <- betas[1] + betas[2] * p.black

# Simulation: each pass draws fresh errors, refits the regression, and keeps
# the slope. seq_len() is used instead of 1:nsims so that nsims = 0 runs the
# loop zero times rather than iterating over c(1, 0).
for (i in seq_len(nsims)) {
  epsilon <- rnorm(n, mean = 0, sd = sigma) # random errors
  y <- mu + epsilon # response
  lm.temp <- lm(y ~ p.black)
  ## extract beta-hat (second coefficient = slope for p.black)
  beta.hat[i] <- coef(lm.temp)[2]
}

Plot results

# Histogram of the simulated slope estimates
hist(
  beta.hat,
  col = "gray",
  xlab = "",
  main = expression(paste("Sampling Distribution of ", hat(beta)[1]))
)
# Vertical line at the slope used to generate the data (population parameter)
abline(v = betas[2])

Document footer

Session Information:

# Record the R version, platform and package versions used for this document
sessionInfo()

## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] abd_0.2-8         mosaic_1.8.3      ggridges_0.5.3    mosaicData_0.20.2
##  [5] ggformula_0.10.1  ggstance_0.3.5    dplyr_1.0.9       Matrix_1.4-1     
##  [9] ggplot2_3.3.6     lattice_0.20-45   nlme_3.1-157      knitr_1.39       
## 
## loaded via a namespace (and not attached):
##  [1] ggrepel_0.9.1     Rcpp_1.0.9        tidyr_1.2.0       assertthat_0.2.1 
##  [5] digest_0.6.29     utf8_1.2.2        ggforce_0.3.3     R6_2.5.1         
##  [9] plyr_1.8.7        backports_1.4.1   labelled_2.9.1    evaluate_0.16    
## [13] highr_0.9         pillar_1.8.0      rlang_1.0.4       rstudioapi_0.13  
## [17] jquerylib_0.1.4   rmarkdown_2.14    splines_4.2.1     readr_2.1.2      
## [21] stringr_1.4.0     htmlwidgets_1.5.4 polyclip_1.10-0   munsell_0.5.0    
## [25] broom_1.0.0       compiler_4.2.1    xfun_0.31         pkgconfig_2.0.3  
## [29] htmltools_0.5.3   tidyselect_1.1.2  gridExtra_2.3     tibble_3.1.8     
## [33] mosaicCore_0.9.0  fansi_1.0.3       withr_2.5.0       tzdb_0.3.0       
## [37] MASS_7.3-57       jsonlite_1.8.0    gtable_0.3.0      lifecycle_1.0.1  
## [41] DBI_1.1.3         magrittr_2.0.3    scales_1.2.0      cli_3.3.0        
## [45] stringi_1.7.8     cachem_1.0.6      farver_2.1.1      leaflet_2.1.1    
## [49] bslib_0.4.0       ggdendro_0.1.23   ellipsis_0.3.2    generics_0.1.3   
## [53] vctrs_0.4.1       tools_4.2.1       forcats_0.5.1     glue_1.6.2       
## [57] tweenr_1.0.2      purrr_0.3.4       crosstalk_1.2.0   hms_1.1.1        
## [61] fastmap_1.1.0     yaml_2.3.5        colorspace_2.0-3  haven_2.5.0      
## [65] sass_0.4.2