## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>")
oldopts <- options(digits = 3, scipen = 999)
print_num <- function(x) print(sprintf("%.3f", x))

## ----install, eval = FALSE----------------------------------------------------
# # From CRAN
# install.packages("algebraic.mle")
# 
# # Development version from GitHub
# devtools::install_github("queelius/algebraic.mle")

## ----setup, include = FALSE---------------------------------------------------
library(algebraic.dist)
library(algebraic.mle)
library(numDeriv)
library(MASS)
library(mvtnorm)
has_ggplot2 <- requireNamespace("ggplot2", quietly = TRUE) &&
               requireNamespace("tibble", quietly = TRUE)
if (has_ggplot2) {
  library(ggplot2)
  library(tibble)
}
set.seed(1234)

## -----------------------------------------------------------------------------
fit_normal <- function(data) {
    sigma <- function(data) {
        mean((data - mean(data))^2)
    }
    loglik <- function(par, data) {
        n <- length(data)
        -n / 2 * log(2 * pi * par[2]) - 1 / (2 * par[2]) *
            (sum(data^2) - 2 * par[1] * sum(data) + n * par[1]^2)
    }
    par.hat <- c(mu = mean(data), var = sigma(data))
    H <- numDeriv::hessian(func = loglik, x = par.hat, data = data)
    algebraic.mle::mle(
        theta.hat = par.hat,
        loglike = loglik(par.hat, data),
        score = numDeriv::grad(func = loglik, x = par.hat, data = data),
        sigma = MASS::ginv(-H),
        info = -H,
        obs = NULL,
        nobs = length(data),
        superclasses = c("mle_normal"))
}

## ----theta-samp-mc------------------------------------------------------------
theta_samp_mc <- function(n, theta, B = 10000) {
    mu <- theta[1]
    var <- theta[2]
    mles <- matrix(NA, nrow = B, ncol = 2)
    for (i in 1:B) {
        d <- rnorm(n, mean = mu, sd = sqrt(var))
        mles[i, ] <- params(fit_normal(d))
    }
    colnames(mles) <- c("mu", "var")
    mles
}

## ----mc-samp, cache=TRUE------------------------------------------------------
# Set up the parameters of a simulation
set.seed(913254)
n <- 70
mu <- 1
var <- 1
B <- 1000
theta <- c(mu, var)
mles <- theta_samp_mc(n = n, theta = theta, B = B)
head(mles)

## ----fig.width=4, fig.height=4, echo=FALSE, fig.align="center", fig.cap="Sampling distribution of the MLEs.", warning=FALSE, cache=TRUE, eval=has_ggplot2----
tmp <- colMeans(mles)
# countour plot of the sampling distribution `mles` and with the point of the
# true mean and variance, theta = (mu, var), shown. label it "true parameter"
ggplot(as_tibble(mles), aes(x = mu, y = var)) +
    labs(x = expression(mu), y = expression(sigma^2)) +
    theme_bw() +
    geom_hline(yintercept = var) +
    geom_vline(xintercept = mu) + 
    geom_point(aes(x = tmp[1], y = tmp[2]), color = "green") +
    geom_point(aes(x = mu, y = var), color = "red") +
    stat_density2d(aes(fill = after_stat(level)), geom = "polygon", alpha = 0.2) +
    scale_fill_gradient(low = "blue", high = "red")

## -----------------------------------------------------------------------------
theta.mc <- algebraic.dist::empirical_dist(mles)

## -----------------------------------------------------------------------------
(mu.mc <- mean(theta.mc))

## ----expectation-tests, cache = TRUE------------------------------------------
# should sum to 1
expectation(theta.mc, function(x) 1)
# mean
expectation(theta.mc, function(x) x)
# variance of (mu, var)
expectation(theta.mc, function(x) (x - mu.mc)^2)
# kurtosis of (mu, var) 
expectation(theta.mc, function(x) (x - mu.mc)^4) /
    expectation(theta.mc, function(x) (x - mu.mc)^2)^2
# skewness of mu and var -- should be (0, 0)
expectation(theta.mc, function(x) ((x - mu.mc) / theta)^3)
# covariance of (mu, var) -- should be around 0
expectation(theta.mc, function(x) (x[1] - mu.mc[1]) * (x[2] - mu.mc[2]))

## -----------------------------------------------------------------------------
bias.mle_normal <- function(x, par = NULL, ...) {
    if (is.null(par)) {
        par <- params(x)
    }
    c(mu = 0, var = -(1 / nobs(x)) * par[2])
}

## ----bias-1-------------------------------------------------------------------
# first, we sample some data from the true distribution
data <- rnorm(n = n, mean = mu, sd = sqrt(var))

# now we fit it to the normal distribution
theta.hat <- fit_normal(data)

# now we compute the bias, first using the asymptotic theory
bias(theta.hat, theta)
# now using the empirical sampling distribution
expectation(theta.mc, function(x) x - theta) # mean(theta.mc) - theta

## ----bias-code, cache = TRUE--------------------------------------------------
N <- 1000
ns <- seq(10, 500, 10)
bias_var <- numeric(length(ns))
j <- 1
for (n in ns) {
    vars <- numeric(length(N))
    for (i in 1:N) {
        d <- rnorm(n = n, mean = mu, sd = sqrt(var))
        fit <- fit_normal(d)
        vars[i] <- params(fit)[2]
    }
    bias_var[j] <- mean(vars) - var
    j <- j + 1
}

## ----bias-plot, echo=FALSE, fig.height=4, fig.width=6-------------------------
plot(ns, bias_var, type = "l",
    xlab = "Sample size", ylab = "Bias(Var)")
# plot the asymptotic bias
lines(ns, -(1 / ns) * var, lty = 2)

## -----------------------------------------------------------------------------
round(vcov(theta.hat), digits=3)
round(vcov(theta.mc), digits=3)

## ----confint-1----------------------------------------------------------------
confint(theta.hat)

## ----covearge-prob, cache = TRUE----------------------------------------------
N <- 1000
ns <- c(seq(50, 200, 50), 300, 600, 1000)
coverage_prob <- matrix(NA, nrow=length(ns), ncol=2)
j <- 1
for (n in ns) {
    count1 <- 0L
    count2 <- 0L
    for (i in 1:N) {
        d <- rnorm(n = n, mean = mu, sd = sqrt(var))
        fit <- fit_normal(d)
        ci <- confint(fit)
        if (ci[1, 1] <= mu && mu <= ci[1, 2]) {
            count1 <- count1 + 1
        }
        if (ci[2, 1] <= var && var <= ci[2, 2]) {
            count2 <- count2 + 1
        }
    }
    coverage_prob[j, 1] <- count1 / N
    coverage_prob[j, 2] <- count2 / N
    j <- j + 1
}

## ----cov-plot, echo = FALSE, fig.height = 4, fig.width = 5, cache = TRUE------
plot(ns, coverage_prob[, 2], type = "l",
    xlab = "Sample size", col = "blue", ylab = "Coverage")
lines(ns, coverage_prob[, 1], col = "green")
legend("bottomright", legend = c("mu", "var"), col = c("blue", "green"), lty = 1)
abline(h = 0.95, lty = 2)

## ----mse-1--------------------------------------------------------------------
mse.hat <- mse(theta.hat, theta)
mse.mc <- matrix(expectation(theta.mc,
    function(x) (x - theta) %*% t(x - theta)), nrow = 2)

round(mse.hat, digits = 3)
round(mse.mc, digits = 3)

## -----------------------------------------------------------------------------
# temporarily show more digits in the numbers/outputs for this code block
op <- options(digits = 12)
# mse(mu)
expectation(theta.mc, function(x) (x[1] - mu)^2)
# variance(mu)
(mu.var <- expectation(theta.mc, function(x) (x[1] - mean(theta.mc)[1])^2))
b <- expectation(theta.mc, function(x) x[1] - mu)
# mse = bias^2 + variance
b^2 + mu.var
options(op)

## ----mse-sample-size, cache = TRUE--------------------------------------------
ns <- seq(25, 200, 25)
mses.mc <- matrix(NA, nrow = length(ns), ncol = 2)
mses.hat <- matrix(NA, nrow = length(ns), ncol = 2)
mses.hat.hat <- matrix(NA, nrow = length(ns), ncol = 2)
j <- 1
for (n in ns) {
    theta.n <- empirical_dist(theta_samp_mc(n = n, theta = theta, B = B))
    # Use mean() on the sampled distribution for MSE calculation
    samples <- obs(theta.n)
    mse.mu.n <- mean((samples[, 1] - mu)^2)
    mse.var.n <- mean((samples[, 2] - var)^2)
    data <- rnorm(n = n, mean = mu, sd = sqrt(var))
    fit <- fit_normal(data)
    mses.mc[j, ] <- c(mse.mu.n, mse.var.n)
    mses.hat[j, ] <- diag(mse(fit, theta))
    mses.hat.hat[j, ] <- diag(mse(fit))
    j <- j + 1
}

## ----mse-plots, echo = FALSE, fig.height = 4, fig.width = 6-------------------
plot(ns, mses.mc[, 1], type = "l", col = rgb(0,0,1,0.5), ylab = "MSE(mu)", xlab = "Sample size") # blue with transparency
lines(ns, mses.hat[, 1], col = rgb(0,1,0,0.5)) # green with transparency
lines(ns, mses.hat.hat[, 1], col = rgb(1,0,0,0.5)) # red with transparency
legend("topright", legend = c("MC", "Asymptotic", "Asymptotic (estimate)"), 
    col = c(rgb(0,0,1,0.5), rgb(0,1,0,0.5), rgb(1,0,0,0.5)), lty = 1)

plot(ns, mses.mc[, 2], type = "l", col = rgb(0,0,1,0.5), ylab = "MSE(var)", xlab = "Sample size") # blue with transparency
lines(ns, mses.hat[, 2], col = rgb(0,1,0,0.5)) # green with transparency
lines(ns, mses.hat.hat[, 2], col = rgb(1,0,0,0.5)) # red with transparency
legend("topright", legend = c("MC", "Asymptotic", "Asymptotic (estimate)"), 
    col = c(rgb(0,0,1,0.5), rgb(0,1,0,0.5), rgb(1,0,0,0.5)), lty = 1)

## ----mle-boot, cache=TRUE-----------------------------------------------------
# Simulate a sample of n observations from a normal with mean 1 and variance 2.
library(boot)
theta.boot <- mle_boot(boot(
    data = data,
    statistic = function(x, ind) {
        params(fit_normal(x[ind]))
    },
    R = B))

## -----------------------------------------------------------------------------
params(theta.boot)
confint(theta.boot)

## -----------------------------------------------------------------------------
theta.b <- empirical_dist(theta.boot$t)

## ----expectation-tests-boot, cache = TRUE-------------------------------------
# should sum to 1
expectation(theta.b, function(x) 1)
# mean
(mu.b <- mean(theta.b))
# variance of (mu, var)
expectation(theta.b, function(x) (x - mu.b)^2)
# kurtosis of (mu, var) 
expectation(theta.b, function(x) (x - mu.b)^4) /
    expectation(theta.b, function(x) (x - mu.b)^2)^2
# skewness of mu and var -- should be (0, 0)
expectation(theta.b, function(x) ((x - mu.b) / theta)^3)
# covariance of (mu, var) -- should be around 0
expectation(theta.b, function(x) (x[1] - mu.b[1]) * (x[2] - mu.b[2]))

## ----bias-2-------------------------------------------------------------------
bias(theta.boot)
expectation(theta.mc, function(x) x - theta)
bias(theta.hat, theta)

## -----------------------------------------------------------------------------
round(vcov(theta.b), digits = 3)
round(vcov(theta.mc), digits = 3)
round(vcov(theta.hat), digits = 3)

## ----coverage-prob-boot, cache = TRUE-----------------------------------------
N <- 100
ns <- c(50, 100)
coverage_prob <- matrix(NA, nrow=length(ns), ncol=2)
j <- 1
for (n in ns) {
    count1 <- 0L
    count2 <- 0L
    for (i in 1:N) {
        d <- rnorm(n = n, mean = mu, sd = sqrt(var))
        fit.boot <- mle_boot(boot(
            data = d,
            statistic = function(x, ind) {
                params(fit_normal(x[ind]))
            },
            R = 250))
        ci <- confint(fit.boot)
        if (ci[1, 1] <= mu && mu <= ci[1, 2]) {
            count1 <- count1 + 1
        }
        if (ci[2, 1] <= var && var <= ci[2, 2]) {
            count2 <- count2 + 1
        }
    }
    coverage_prob[j, 1] <- count1 / N
    coverage_prob[j, 2] <- count2 / N
    #cat("n = ", n, ", coverage = ", coverage_prob[j, ], "\n")
    j <- j + 1
}

## ----cov-plot-boot, echo = FALSE, fig.height = 4, fig.width = 5, cache = TRUE----
plot(ns, coverage_prob[, 2], type = "l",
    xlab = "Sample size", col = "blue", ylab = "Coverage")
lines(ns, coverage_prob[, 1], col = "green")
legend("bottomright", legend = c("mu", "var"), col = c("blue", "green"), lty = 1)
abline(h = 0.95, lty = 2)

## -----------------------------------------------------------------------------
samp <- function(n, par) rnorm(n = n, mean = par[1], sd = sqrt(par[2]))
pred(x = theta.hat, samp = samp)

## -----------------------------------------------------------------------------
par <- params(theta.hat)
mu.hat <- par[1]
var.hat <- par[2]
c(mu.hat, qnorm(c(.025,.975), mean = mu.hat, sd = sqrt(var.hat)))

## -----------------------------------------------------------------------------
N <- 100
r <- 5
samp <- rnorm(N, mean = theta[1], sd = sqrt(theta[2]))
samp.sub <- matrix(samp, nrow = r)
mles.sub <- list(length = r)
for (i in 1:r)
    mles.sub[[i]] <- fit_normal(samp.sub[i,])

mle.wt <- combine(mles.sub)
mle <- fit_normal(samp)

## -----------------------------------------------------------------------------
params(mle.wt)
vcov(mle.wt)

## -----------------------------------------------------------------------------
params(mle)
vcov(mle)

## ----include = FALSE----------------------------------------------------------
options(oldopts)