Statistical-Rethinking/Exercises chapter 9.Rmd at main · martsc1/Statistical-Rethinking · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
Exercises chapter 9

```{r}
num_weeks <- 1e5
positions <- rep(0,num_weeks)
current <- 10
for ( i in 1:num_weeks ) {
  ## record current position
  positions[i] <- current
  ## flip coin to generate proposal
  proposal <- current + sample( c(-1,1) , size=1 )
  ## now make sure he loops around the archipelago
  if ( proposal < 1 ) proposal <- 10
  if ( proposal > 10 ) proposal <- 1
  ## move?
  prob_move <- proposal/current
  current <- ifelse( runif(1) < prob_move , proposal , current )
}

plot( 1:100 , positions[1:100] )
plot( table( positions ) )
```

```{r}
D <- 10
T <- 1e3
Y <- rmvnorm(T,rep(0,D),diag(D))
rad_dist <- function( Y ) sqrt( sum(Y^2) )
Rd <- sapply( 1:T , function(i) rad_dist( Y[i,] ) )
dens( Rd )
```

9.4 Easy HMC: ulam
```{r}
library(rethinking)
data(rugged)
d <- rugged
d$log_gdp <- log(d$rgdppc_2000)
dd <- d[ complete.cases(d$rgdppc_2000) , ]
dd$log_gdp_std <- dd$log_gdp / mean(dd$log_gdp)
dd$rugged_std <- dd$rugged / max(dd$rugged)
dd$cid <- ifelse( dd$cont_africa==1 , 1 , 2 )

m8.3 <- quap(
  alist(
    log_gdp_std ~ dnorm( mu , sigma ) ,
    mu <- a[cid] + b[cid]*( rugged_std - 0.215 ) ,
    a[cid] ~ dnorm( 1 , 0.1 ) ,
    b[cid] ~ dnorm( 0 , 0.3 ) ,
    sigma ~ dexp( 1 )
  ) , data=dd )
precis( m8.3 , depth=2 )

dat_slim <- list(
log_gdp_std = dd$log_gdp_std,
rugged_std = dd$rugged_std,
cid = as.integer( dd$cid )
)
str(dat_slim)

m9.1 <- ulam(
  alist(
    log_gdp_std ~ dnorm( mu , sigma ) ,
    mu <- a[cid] + b[cid]*( rugged_std - 0.215 ) ,
    a[cid] ~ dnorm( 1 , 0.1 ) ,
    b[cid] ~ dnorm( 0 , 0.3 ) ,
    sigma ~ dexp( 1 )
  ) , data=dat_slim , chains=1 )

precis( m9.1 , depth=2 )

m9.1 <- ulam(
  alist(
    log_gdp_std ~ dnorm( mu , sigma ) ,
    mu <- a[cid] + b[cid]*( rugged_std - 0.215 ) ,
    a[cid] ~ dnorm( 1 , 0.1 ) ,
    b[cid] ~ dnorm( 0 , 0.3 ) ,
    sigma ~ dexp( 1 )
  ) , data=dat_slim , chains=4 , cores=4 )

show(m9.1)
pairs( m9.1 )
traceplot( m9.1 )
trankplot( m9.1 )

```

```{r}
y <- c(-1,1)
set.seed(11)
m9.2 <- ulam(
  alist(
    y ~ dnorm( mu , sigma ) ,
    mu <- alpha ,
    alpha ~ dnorm( 0 , 1000 ) ,
    sigma ~ dexp( 0.0001 )
  ) , data=list(y=y) , chains=3 )

precis( m9.2 )
pairs( m9.2@stanfit )
traceplot(m9.2)
trankplot(m9.2)


set.seed(11)
m9.3 <- ulam(
  alist(
    y ~ dnorm( mu , sigma ) ,
    mu <- alpha ,
    alpha ~ dnorm( 1 , 10 ) ,
    sigma ~ dexp( 1 )
  ) , data=list(y=y) , chains=3, cores=3 )
precis( m9.3 )


```

```{r}
set.seed(41)
y <- rnorm( 100 , mean=0 , sd=1 )

set.seed(384)
m9.4 <- ulam(
  alist(
    y ~ dnorm( mu , sigma ) ,
    mu <- a1 + a2 ,
    a1 ~ dnorm( 0 , 1000 ),
    a2 ~ dnorm( 0 , 1000 ),
    sigma ~ dexp( 1 )
  ) , data=list(y=y) , chains=3, cores=3 )
precis( m9.4 )

m9.5 <- ulam(
  alist(
    y ~ dnorm( mu , sigma ) ,
    mu <- a1 + a2 ,
    a1 ~ dnorm( 0 , 10 ),
    a2 ~ dnorm( 0 , 10 ),
    sigma ~ dexp( 1 )
  ) , data=list(y=y) , chains=3, cores=3 )
precis( m9.5 )

```

######################### Exercises #########################
### Easy
9E1. Which of the following is a requirement of the simple Metropolis algorithm?
(1) The parameters must be discrete.
(2) The likelihood function must be Gaussian.
(3) The proposal distribution must be symmetric.

- 3

9E2. Gibbs sampling is more efficient than the Metropolis algorithm. How does it achieve this extra
efficiency? Are there any limitations to the Gibbs sampling strategy?

The improvement in efficiency of Gibbs sampling arises from adaptive proposals in which the distribution
of proposed parameter values adjusts itself intelligently, depending upon the parameter
values at the moment.

Some drawbacks of Gibbs sampling are that
- it is dependent on conjugate priors
- as models become more complex and contain hundreds or thousands or tens
of thousands of parameters, both Metropolis and Gibbs sampling become shockingly inefficient.


9E3. Which sort of parameters can Hamiltonian Monte Carlo not handle? Can you explain why?
HMC cannot handle discrete parameters. It needs continuous parameters to perform its physics simulation of a gliding particle.

9E4. Explain the difference between the effective number of samples, n_eff as calculated by Stan,
and the actual number of samples.

Stan provides an estimate of effective number of samples, for the purpose of estimating the posterior mean,
as n_eff. You can think of n_eff as the length of a Markov chain with no autocorrelation
that would provide the same quality of estimate as your chain.


9E5. Which value should Rhat approach, when a chain is sampling the posterior distribution correctly?
1

9E6. Sketch a good trace plot for a Markov chain, one that is effectively sampling from the posterior
distribution. What is good about its shape? Then sketch a trace plot for a malfunctioning Markov
chain. What about its shape indicates malfunction?
- good chain looks like a hairy caterpillar that is quite straight
- a bad chain has huge outliers and jumps more all over the place (huge spikes)

9E7. Repeat the problem above, but now for a trace rank plot.
- a good trace rank plot has a lot of overlap for the different chains
- a bad trace rank plot has less overlap, and more dispersion for the different chains.


### Medium
9M1. Re-estimate the terrain ruggedness model from the chapter, but now using a uniform prior
for the standard deviation, sigma. The uniform prior should be dunif(0,1). Use ulam to estimate
the posterior. Does the different prior have any detectible influence on the posterior distribution of
sigma? Why or why not?

```{r}
library(rethinking)
data(rugged)
d <- rugged
d$log_gdp <- log(d$rgdppc_2000)
dd <- d[ complete.cases(d$rgdppc_2000) , ]
dd$log_gdp_std <- dd$log_gdp / mean(dd$log_gdp)
dd$rugged_std <- dd$rugged / max(dd$rugged)
dd$cid <- ifelse( dd$cont_africa==1 , 1 , 2 )

dat_slim <- list(
log_gdp_std = dd$log_gdp_std,
rugged_std = dd$rugged_std,
cid = as.integer( dd$cid )
)
str(dat_slim)

m9.1 <- ulam(
  alist(
    log_gdp_std ~ dnorm( mu , sigma ) ,
    mu <- a[cid] + b[cid]*( rugged_std - 0.215 ) ,
    a[cid] ~ dnorm( 1 , 0.1 ) ,
    b[cid] ~ dnorm( 0 , 0.3 ) ,
    sigma ~ dexp( 1 )
  ) , data=dat_slim , chains=1 )

m9.1u <- ulam(
  alist(
    log_gdp_std ~ dnorm( mu , sigma ) ,
    mu <- a[cid] + b[cid]*( rugged_std - 0.215 ) ,
    a[cid] ~ dnorm( 1 , 0.1 ) ,
    b[cid] ~ dnorm( 0 , 0.3 ) ,
    sigma ~ dunif(0,1)
  ) , data=dat_slim , chains=1 )

precis( m9.1 , depth=2 )
precis( m9.1u , depth=2 )
coeftab(m9.1,m9.1u)
```
The prior does not seem to have made a difference. Probably because both priors are moderate and therefore quickly overwhelmed by the actual data.

9M2. Modify the terrain ruggedness model again. This time, change the prior for b[cid] to dexp(0.3).
What does this do to the posterior distribution? Can you explain it?

```{r}
m9.1e <- ulam(
  alist(
    log_gdp_std ~ dnorm( mu , sigma ) ,
    mu <- a[cid] + b[cid]*( rugged_std - 0.215 ) ,
    a[cid] ~ dnorm( 1 , 0.1 ) ,
    b[cid] ~ dexp( 0.3 ) ,
    sigma ~ dexp(1)
  ) , data=dat_slim , chains=1 )

precis( m9.1e , depth=2 )
traceplot(m9.1e)
coeftab(m9.1,m9.1u,m9.1e)
```
Changing the prior for b[cid] to an exponential did affect the posterior distribution, at least for b[2], because it constrains it to positive values. The models without this constraint return a negative value for b[2].


9M3. Re-estimate one of the Stan models from the chapter, but at different numbers of warmup iterations.
Be sure to use the same number of sampling iterations in each case. Compare the n_eff
values. How much warmup is enough?

```{r}
library(rethinking)
data(rugged)
d <- rugged
d$log_gdp <- log(d$rgdppc_2000)
dd <- d[ complete.cases(d$rgdppc_2000) , ]
dd$log_gdp_std <- dd$log_gdp / mean(dd$log_gdp)
dd$rugged_std <- dd$rugged / max(dd$rugged)
dd$cid <- ifelse( dd$cont_africa==1 , 1 , 2 )

dat_slim <- list(
log_gdp_std = dd$log_gdp_std,
rugged_std = dd$rugged_std,
cid = as.integer( dd$cid )
)

m9.1a <- ulam(
  alist(
    log_gdp_std ~ dnorm( mu , sigma ) ,
    mu <- a[cid] + b[cid]*( rugged_std - 0.215 ) ,
    a[cid] ~ dnorm( 1 , 0.1 ) ,
    b[cid] ~ dnorm( 0 , 0.3 ) ,
    sigma ~ dexp( 1 )
  ) , data=dat_slim , chains=4, cores=4 , iter = 1000, warmup = 500)

m9.1b <- ulam(
  alist(
    log_gdp_std ~ dnorm( mu , sigma ) ,
    mu <- a[cid] + b[cid]*( rugged_std - 0.215 ) ,
    a[cid] ~ dnorm( 1 , 0.1 ) ,
    b[cid] ~ dnorm( 0 , 0.3 ) ,
    sigma ~ dexp( 1 )
  ) , data=dat_slim , chains=4, cores=4 , iter = 1000, warmup = 250)

m9.1c <- ulam(
  alist(
    log_gdp_std ~ dnorm( mu , sigma ) ,
    mu <- a[cid] + b[cid]*( rugged_std - 0.215 ) ,
    a[cid] ~ dnorm( 1 , 0.1 ) ,
    b[cid] ~ dnorm( 0 , 0.3 ) ,
    sigma ~ dexp( 1 )
  ) , data=dat_slim , chains=4, cores=4 , iter = 1000, warmup = 100)

m9.1d <- ulam(
  alist(
    log_gdp_std ~ dnorm( mu , sigma ) ,
    mu <- a[cid] + b[cid]*( rugged_std - 0.215 ) ,
    a[cid] ~ dnorm( 1 , 0.1 ) ,
    b[cid] ~ dnorm( 0 , 0.3 ) ,
    sigma ~ dexp( 1 )
  ) , data=dat_slim , chains=4, cores=4 , iter = 1000, warmup = 50)

m9.1e <- ulam(
  alist(
    log_gdp_std ~ dnorm( mu , sigma ) ,
    mu <- a[cid] + b[cid]*( rugged_std - 0.215 ) ,
    a[cid] ~ dnorm( 1 , 0.1 ) ,
    b[cid] ~ dnorm( 0 , 0.3 ) ,
    sigma ~ dexp( 1 )
  ) , data=dat_slim , chains=4, cores=4 , iter = 1000, warmup = 25)

m9.1f <- ulam(
  alist(
    log_gdp_std ~ dnorm( mu , sigma ) ,
    mu <- a[cid] + b[cid]*( rugged_std - 0.215 ) ,
    a[cid] ~ dnorm( 1 , 0.1 ) ,
    b[cid] ~ dnorm( 0 , 0.3 ) ,
    sigma ~ dexp( 1 )
  ) , data=dat_slim , chains=4, cores=4 , iter = 1000, warmup = 10)

m9.1g <- ulam(
  alist(
    log_gdp_std ~ dnorm( mu , sigma ) ,
    mu <- a[cid] + b[cid]*( rugged_std - 0.215 ) ,
    a[cid] ~ dnorm( 1 , 0.1 ) ,
    b[cid] ~ dnorm( 0 , 0.3 ) ,
    sigma ~ dexp( 1 )
  ) , data=dat_slim , chains=4, cores=4 , iter = 1000, warmup = 5)

coeftab(m9.1a,m9.1b,m9.1c,m9.1d,m9.1e,m9.1f,m9.1g)
precis(m9.1a, depth = 2)
precis(m9.1b, depth = 2)
precis(m9.1c, depth = 2)
precis(m9.1d, depth = 2)
precis(m9.1e, depth = 2)
precis(m9.1f, depth = 2)
precis(m9.1g, depth = 2)

traceplot(m9.1f)
```
Interestingly, reducing the number of warm up samples has a relatively unpredictable outcome, leading sometimes even to increases in the number of effective samples. For this example, you might even get away with using only 25 warm up rounds as you still get a decent traceplot and multiple hundreds of effective samples for each predictor.
However, the highest number of effective samples you seem to get with 250 warm up rounds. Note, the number of chains used also affects these results, and with fewer chains there seems to be a lower penalty of using fewer warmup rounds.

### Hard

9H2. Recall the divorce rate example from Chapter 5. Repeat that analysis, using ulam this time,
fitting models m5.1, m5.2, and m5.3. Use compare to compare the models on the basis of WAIC
or PSIS. To use WAIC or PSIS with ulam, you need add the argument log_log=TRUE. Explain the
model comparison results.

```{r}
data(WaffleDivorce)
d <- WaffleDivorce
# standardize variables
d$D <- standardize( d$Divorce )
d$M <- standardize( d$Marriage )
d$A <- standardize( d$MedianAgeMarriage )

dat_slim <- list(
Divorce = d$D,
Marriage = d$M,
Age = d$A)

m5.1 <- ulam(
  alist(
    Divorce ~ dnorm( mu , sigma ) ,
    mu    <- a + bA * Age ,
    a     ~ dnorm( 0 , 0.2 ) ,
    bA    ~ dnorm( 0 , 0.5 ) ,
    sigma ~ dexp( 1 )
  ) , data = dat_slim , chains=1, log_lik = TRUE )

m5.2 <- ulam(
  alist(
    Divorce ~ dnorm( mu , sigma ) ,
    mu <- a + bM * Marriage ,
    a ~ dnorm( 0 , 0.2 ) ,
    bM ~ dnorm( 0 , 0.5 ) ,
    sigma ~ dexp( 1 )
  ) , data = dat_slim , chains=1, log_lik = TRUE )

m5.3 <- ulam(
  alist(
    Divorce ~ dnorm( mu , sigma ) ,
    mu <- a + bM*Marriage + bA*Age ,
    a ~ dnorm( 0 , 0.2 ) ,
    bM ~ dnorm( 0 , 0.5 ) ,
    bA ~ dnorm( 0 , 0.5 ) ,
    sigma ~ dexp( 1 )
  ) , data = dat_slim , chains=1, log_lik = TRUE )


compare(m5.1,m5.2,m5.3, func = PSIS)


```
The results seem the same as for the quap models. The simple model just taking age into account performs best.