Model basics

The goal of a model is to provide a simple low-dimensional summary of a dataset. In the context of this book we’re going to use models to partition data into patterns and residuals. Strong patterns will hide subtler trends, so we’ll use models to help peel back layers of structure as we explore a dataset.

Simple model

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.4
## -- Attaching packages ----------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## -- Conflicts -------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(modelr)
options(na.action = na.warn)

# load data
# download.file("https://github.com/hadley/modelr/blob/master/data/sim1.rda", "sim1.rda")
# load("sim1.rda")

ggplot(sim1, aes(x, y)) + geom_point()

models <- tibble(
  a1 = runif(250, -20, 40),
  a2 = runif(250, -5, 5)
)

ggplot(sim1, aes(x, y)) + geom_point() + 
  geom_abline(aes(intercept = a1, slope = a2), data = models, alpha = 1/4)

Model family

model1 <- function(a, data) {
  a[1] + data$x * a[2]
}

model1(c(7, 1.5), sim1)
##  [1]  8.5  8.5  8.5 10.0 10.0 10.0 11.5 11.5 11.5 13.0 13.0 13.0 14.5 14.5
## [15] 14.5 16.0 16.0 16.0 17.5 17.5 17.5 19.0 19.0 19.0 20.5 20.5 20.5 22.0
## [29] 22.0 22.0

Root-mean-squared deviation

measure_distance <- function(mod, data) {
  diff <- data$y - model1(mod, data)
  sqrt(mean(diff ^ 2))
}

measure_distance(c(7, 1.5), sim1)
## [1] 2.665212

Compute the distance for all the models defined above

sim1_dist <- function(a1, a2) {
  measure_distance(c(a1, a2), sim1)
}

models <- models %>% 
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
models
## # A tibble: 250 x 3
##         a1     a2  dist
##      <dbl>  <dbl> <dbl>
##  1 12.3     0.253  5.87
##  2 11.1    -3.88  30.9 
##  3 24.7     4.68  35.8 
##  4 31.1    -3.11  15.0 
##  5 33.5    -3.41  15.8 
##  6 15.0     0.876  5.90
##  7 25.5    -3.81  20.2 
##  8  0.0769  4.81  13.7 
##  9 20.7     0.154  8.42
## 10 38.2    -2.52  16.0 
## # ... with 240 more rows
ggplot(sim1, aes(x, y)) + 
  geom_point(size = 2, colour = "grey30") + 
  geom_abline(
    aes(intercept = a1, slope = a2, colour = -dist), 
    data = filter(models, rank(dist) <= 10)
  )

ggplot(models, aes(a1, a2)) +
  geom_point(data = filter(models, rank(dist) <= 10), size = 4, colour = "red") +
  geom_point(aes(colour = -dist))

Grid

grid <- expand.grid(
  a1 = seq(-5, 20, length = 25),
  a2 = seq(1, 3, length = 25)
  ) %>% 
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))

grid %>% 
  ggplot(aes(a1, a2)) +
  geom_point(data = filter(grid, rank(dist) <= 10), size = 4, colour = "red") +
  geom_point(aes(colour = -dist)) 

ggplot(sim1, aes(x, y)) + 
  geom_point(size = 2, colour = "grey30") + 
  geom_abline(
    aes(intercept = a1, slope = a2, colour = -dist), 
    data = filter(grid, rank(dist) <= 10)
  )

Newton Raphson Method: https://brilliant.org/wiki/newton-raphson-method/

best <- optim(c(0, 0), measure_distance, data = sim1)
best$par
## [1] 4.222248 2.051204
ggplot(sim1, aes(x, y)) + 
  geom_point(size = 2, colour = "grey30") + 
  geom_abline(intercept = best$par[1], slope = best$par[2])

Linear model

sim1_mod = lm(y ~ x, data = sim1)
coef(sim1_mod)
## (Intercept)           x 
##    4.220822    2.051533

Visualising models

grid <- sim1 %>% 
  data_grid(x) 
grid
## # A tibble: 10 x 1
##        x
##    <int>
##  1     1
##  2     2
##  3     3
##  4     4
##  5     5
##  6     6
##  7     7
##  8     8
##  9     9
## 10    10
grid <- grid %>% 
  add_predictions(sim1_mod) 
grid
## # A tibble: 10 x 2
##        x  pred
##    <int> <dbl>
##  1     1  6.27
##  2     2  8.32
##  3     3 10.4 
##  4     4 12.4 
##  5     5 14.5 
##  6     6 16.5 
##  7     7 18.6 
##  8     8 20.6 
##  9     9 22.7 
## 10    10 24.7

Residuals

sim1 = sim1 %>% 
  add_residuals(sim1_mod)

ggplot(sim1, aes(x)) +
  geom_point(aes(y = y)) +
  geom_line(aes(y = pred), data = grid, colour = "red", size = 1)

ggplot(sim1, aes(resid)) + geom_freqpoly(binwidth = 0.5)

ggplot(sim1, aes(x, resid)) +
  geom_ref_line(h = 0) +
  geom_point()

# This looks like random noise, suggesting that our model has done a good job of capturing the patterns in the dataset.