Skip to contents

Exploration of the QMLE method for ARMA models

qmle <- function(data, theta, p = 1, q = 1) {
  if (nrow(data) < max(p, q) + 1) {
    return(0)
  }
  variance_term <- rep(0, nrow(data))
  for (i in (max(p, q) + 1):nrow(data)) {
    variance_term[i] <-
      data[i] -
      theta[1:p] %*% data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% variance_term[(i - 1):(i - q)]
  }
  (log(2 * pi) + log(theta[p + q + 1])) * (nrow(data) - q) / 2 +
    sum(variance_term^2) / (2 * theta[p + q + 1])
}
qmle_gradient <- function(data, theta, p = 1, q = 1) {
  if (nrow(data) < max(p, q) + 1) {
    return(rep(1, length(theta)))
  }
  variance_term <- rep(0, nrow(data))
  for (i in (max(p, q) + 1):nrow(data)) {
    variance_term[i] <-
      data[i] -
      theta[1:p] %*% data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% variance_term[(i - 1):(i - q)]
  }
  phi_coefficient <- matrix(0, nrow(data), p)
  psi_coefficient <- matrix(0, nrow(data), q)
  for (i in (max(p, q) + 1):nrow(data)) {
    phi_coefficient[i, ] <-
      -data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% phi_coefficient[(i - 1):(i - q), ]
  }
  for (i in (q + 1):nrow(data)) {
    psi_coefficient[i, ] <-
      -variance_term[(i - 1):(i - q)] -
      theta[(p + 1):(p + q)] %*% psi_coefficient[(i - 1):(i - q), ]
  }
  c(
    phi_coefficient[nrow(data), ] * variance_term[nrow(data)] /
      theta[p + q + 1],
    psi_coefficient[nrow(data), ] * variance_term[nrow(data)] /
      theta[p + q + 1],
    1 / 2 / theta[p + q + 1] -
      variance_term[nrow(data)]^2 / (2 * theta[p + q + 1]^2)
  )
}
qmle_gradient_sum <- function(data, theta, p = 1, q = 1) {
  if (nrow(data) < max(p, q) + 1) {
    return(rep(1, length(theta)))
  }
  variance_term <- rep(0, nrow(data))
  for (i in (max(p, q) + 1):nrow(data)) {
    variance_term[i] <-
      data[i] -
      theta[1:p] %*% data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% variance_term[(i - 1):(i - q)]
  }
  phi_coefficient <- matrix(0, nrow(data), p)
  psi_coefficient <- matrix(0, nrow(data), q)
  for (i in (max(p, q) + 1):nrow(data)) {
    phi_coefficient[i, ] <-
      -data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% phi_coefficient[(i - 1):(i - q), ]
  }
  for (i in (q + 1):nrow(data)) {
    psi_coefficient[i, ] <-
      -variance_term[(i - 1):(i - q)] -
      theta[(p + 1):(p + q)] %*% psi_coefficient[(i - 1):(i - q), ]
  }
  c(
    crossprod(phi_coefficient, variance_term) / theta[p + q + 1],
    crossprod(psi_coefficient, variance_term) / theta[p + q + 1],
    (nrow(data) - q) / 2 / theta[p + q + 1] -
      crossprod(variance_term) / 2 / theta[p + q + 1]^2
  )
}
qmle_hessian <- function(data, theta, p = 1, q = 1) {
  if (nrow(data) < max(p, q) + 1) {
    return(diag(length(theta)))
  }
  variance_term <- rep(0, nrow(data))
  for (i in (max(p, q) + 1):nrow(data)) {
    variance_term[i] <-
      data[i] -
      theta[1:p] %*% data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% variance_term[(i - 1):(i - q)]
  }
  phi_coefficient <- matrix(0, nrow(data), p)
  psi_coefficient <- matrix(0, nrow(data), q)
  for (i in (max(p, q) + 1):nrow(data)) {
    phi_coefficient[i, ] <-
      -data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% phi_coefficient[(i - 1):(i - q), ]
  }
  for (i in (q + 1):nrow(data)) {
    psi_coefficient[i, ] <-
      -variance_term[(i - 1):(i - q)] -
      theta[(p + 1):(p + q)] %*% psi_coefficient[(i - 1):(i - q), ]
  }
  phi_psi_coefficient <- array(0, c(q, p, nrow(data)))
  psi_psi_coefficient <- array(0, c(q, q, nrow(data)))
  for (i in (q + 1):nrow(data)) {
    phi_psi_coefficient[, , i] <-
      -phi_coefficient[(i - 1):(i - q), ] -
      rowSums(
        sweep(
          phi_psi_coefficient[, , (i - 1):(i - q), drop = FALSE],
          3,
          theta[(p + 1):(p + q)],
          `*`
        ),
        dims = 2
      )
    psi_psi_coefficient[, , i] <-
      -psi_coefficient[(i - 1):(i - q), ] -
      t(psi_coefficient[(i - 1):(i - q), ]) -
      rowSums(
        sweep(
          psi_psi_coefficient[, , (i - 1):(i - q), drop = FALSE],
          3,
          theta[(p + 1):(p + q)],
          `*`
        ),
        dims = 2
      )
  }
  hessian <- matrix(0, nrow = p + q + 1, ncol = p + q + 1)
  hessian[1:p, 1:p] <-
    crossprod(phi_coefficient[nrow(data), , drop = FALSE]) /
    theta[p + q + 1]
  hessian[1:p, (p + 1):(p + q)] <- (
    t(phi_psi_coefficient[, , nrow(data)]) * variance_term[nrow(data)] +
      crossprod(
        phi_coefficient[nrow(data), , drop = FALSE],
        psi_coefficient[nrow(data), , drop = FALSE]
      )
  ) / theta[p + q + 1]
  hessian[(p + 1):(p + q), 1:p] <- t(hessian[1:p, (p + 1):(p + q)])
  hessian[1:p, p + q + 1] <-
    -t(phi_coefficient[nrow(data), ]) *
    variance_term[nrow(data)] / theta[p + q + 1]^2
  hessian[p + q + 1, 1:p] <- t(hessian[1:p, p + q + 1])
  hessian[(p + 1):(p + q), (p + 1):(p + q)] <- (
    crossprod(psi_coefficient[nrow(data), , drop = FALSE]) +
      psi_psi_coefficient[, , nrow(data)] * variance_term[nrow(data)]
  ) / theta[p + q + 1]
  hessian[(p + 1):(p + q), p + q + 1] <-
    -t(psi_coefficient[nrow(data), ]) *
    variance_term[nrow(data)] / theta[p + q + 1]^2
  hessian[p + q + 1, (p + 1):(p + q)] <-
    t(hessian[(p + 1):(p + q), p + q + 1])
  hessian[p + q + 1, p + q + 1] <-
    variance_term[nrow(data)]^2 / theta[p + q + 1]^3 -
    1 / 2 / theta[p + q + 1]^2
  hessian
}
qmle_hessian_sum <- function(data, theta, p = 1, q = 1) {
  if (nrow(data) < max(p, q) + 1) {
    return(diag(length(theta)))
  }
  variance_term <- rep(0, nrow(data))
  for (i in (max(p, q) + 1):nrow(data)) {
    variance_term[i] <-
      data[i] -
      theta[1:p] %*% data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% variance_term[(i - 1):(i - q)]
  }
  phi_coefficient <- matrix(0, nrow(data), p)
  psi_coefficient <- matrix(0, nrow(data), q)
  for (i in (max(p, q) + 1):nrow(data)) {
    phi_coefficient[i, ] <-
      -data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% phi_coefficient[(i - 1):(i - q), ]
  }
  for (i in (q + 1):nrow(data)) {
    psi_coefficient[i, ] <-
      -variance_term[(i - 1):(i - q)] -
      theta[(p + 1):(p + q)] %*% psi_coefficient[(i - 1):(i - q), ]
  }
  phi_psi_coefficient <- array(0, c(q, p, nrow(data)))
  psi_psi_coefficient <- array(0, c(q, q, nrow(data)))
  for (i in (q + 1):nrow(data)) {
    phi_psi_coefficient[, , i] <-
      -phi_coefficient[(i - 1):(i - q), ] -
      rowSums(
        sweep(
          phi_psi_coefficient[, , (i - 1):(i - q), drop = FALSE],
          3,
          theta[(p + 1):(p + q)],
          `*`
        ),
        dims = 2
      )
    psi_psi_coefficient[, , i] <-
      -psi_coefficient[(i - 1):(i - q), ] -
      t(psi_coefficient[(i - 1):(i - q), ]) -
      rowSums(
        sweep(
          psi_psi_coefficient[, , (i - 1):(i - q), drop = FALSE],
          3,
          theta[(p + 1):(p + q)],
          `*`
        ),
        dims = 2
      )
  }
  hessian <- matrix(0, nrow = p + q + 1, ncol = p + q + 1)
  hessian[1:p, 1:p] <-
    crossprod(phi_coefficient) / theta[p + q + 1]
  hessian[(p + 1):(p + q), 1:p] <- (
    rowSums(
      sweep(
        phi_psi_coefficient,
        3,
        variance_term,
        `*`
      ),
      dims = 2
    ) +
      crossprod(
        psi_coefficient, phi_coefficient
      )
  ) / theta[p + q + 1]
  hessian[1:p, (p + 1):(p + q)] <- t(hessian[(p + 1):(p + q), 1:p])
  hessian[1:p, p + q + 1] <-
    -crossprod(phi_coefficient, variance_term) / theta[p + q + 1]^2
  hessian[p + q + 1, 1:p] <- t(hessian[1:p, p + q + 1])
  hessian[(p + 1):(p + q), (p + 1):(p + q)] <- (
    crossprod(psi_coefficient) + rowSums(
      sweep(
        psi_psi_coefficient,
        3,
        variance_term,
        `*`
      ),
      dims = 2
    )
  ) / theta[p + q + 1]
  hessian[(p + 1):(p + q), p + q + 1] <-
    -crossprod(psi_coefficient, variance_term) / theta[p + q + 1]^2
  hessian[p + q + 1, (p + 1):(p + q)] <-
    t(hessian[(p + 1):(p + q), p + q + 1])
  hessian[p + q + 1, p + q + 1] <-
    crossprod(variance_term) / theta[p + q + 1]^3 -
    (nrow(data) - q) / 2 / theta[p + q + 1]^2
  hessian
}

# fastcpd arma 1 1
set.seed(1)
n <- 600
w <- rnorm(n + 1, 0, 1)
x <- rep(0, n + 1)
for (i in 1:300) {
  x[i + 1] <- 0.1 * x[i] + w[i + 1] + 0.1 * w[i]
}
for (i in 301:n) {
  x[i + 1] <- 0.3 * x[i] + w[i + 1] + 0.4 * w[i]
}
result <- fastcpd(
  formula = ~ . - 1,
  data = data.frame(x = x[1 + seq_len(n)]),
  trim = 0,
  p = 1 + 1 + 1,
  beta = (1 + 1 + 1 + 1) * log(n) / 2,
  cost = qmle,
  cost_gradient = qmle_gradient,
  cost_hessian = qmle_hessian,
  cp_only = TRUE,
  lower = c(rep(-1, 1 + 1), 1e-10),
  upper = c(rep(1, 1 + 1), Inf),
  line_search = c(1, 0.1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9)
)

# fastcpd arma 3 2
set.seed(1)
n <- 600
w <- rnorm(n + 2, 0, 1)
x <- rep(0, n + 3)
for (i in 1:300) {
  x[i + 3] <- 0.1 * x[i + 2] - 0.2 * x[i + 1] + 0.6 * x[i] +
    w[i + 2] + 0.1 * w[i + 1] + 0.5 * w[i]
}
for (i in 301:n) {
  x[i + 3] <- 0.3 * x[i + 2] + 0.4 * x[i + 1] + 0.2 * x[i] +
    w[i + 2] + 0.4 * w[i + 1] + 0.1 * w[i]
}
result <- fastcpd(
  formula = ~ . - 1,
  data = data.frame(x = x[3 + seq_len(n)]),
  trim = 0,
  p = 3 + 2 + 1,
  beta = (3 + 2 + 1 + 1) * log(n) / 2,
  cost = function(data, theta) {
    qmle(data, theta, 3, 2)
  },
  cost_gradient = function(data, theta) {
    qmle_gradient(data, theta, 3, 2)
  },
  cost_hessian = function(data, theta) {
    qmle_hessian(data, theta, 3, 2)
  },
  cp_only = TRUE,
  lower = c(rep(-1, 3 + 2), 1e-10),
  upper = c(rep(1, 3 + 2), Inf),
  line_search = c(1, 0.1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9)
)
testthat::expect_equal(result@cp_set, c(4, 22, 290))

# hessian check
theta_estimate <- rep(0.1, 3 + 2 + 1)
testthat::expect_equal(
  numDeriv::hessian(
    qmle, theta_estimate, data = matrix(x[3 + seq_len(n)]), p = 3, q = 2
  ),
  qmle_hessian_sum(matrix(x[3 + seq_len(n)]), theta_estimate, 3, 2)
)

optim(
  rep(0.1, 3 + 2 + 1),
  fn = function(data, theta) {
    qmle(data, theta, 3, 2)
  },
  data = data,
  method = "L-BFGS-B",
  lower = c(rep(-1, 3 + 2), 1e-10),
  upper = c(rep(1, 3 + 2), Inf),
  gr = function(data, theta) {
    qmle_gradient_sum(data, theta, 3, 2)
  }
)

# convergence check
x <- arima.sim(list(ar = c(0.1, -0.2, 0.6), ma = c(0.1, 0.5)), n = n + 3)
theta_estimate <- rep(0.1, 3 + 2 + 1)
data <- matrix(x[3 + seq_len(n)])
qmle_path <- NULL
prev_qmle <- 1
curr_qmle <- Inf
epochs_num <- 0
while (abs(curr_qmle - prev_qmle) > 1e-5) {
  prev_qmle <- curr_qmle
  hessian <-
    Matrix::nearPD(qmle_hessian_sum(data, theta_estimate, 3, 2))$mat
  step <- solve(
    hessian, qmle_gradient_sum(data, theta_estimate, 3, 2)
  )
  # line search
  lr_choices <- c(1, 0.1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9)
  lr <- lr_choices[which.min(
    sapply(lr_choices, function(lr) {
      qmle(data, pmin(
        pmax(theta_estimate - lr * step, c(rep(-1, 3 + 2), 1e-10)),
        c(rep(1, 3 + 2), Inf)
      ), 3, 2)
    })
  )]
  theta_estimate <- pmin(
    pmax(theta_estimate - lr * step, c(rep(-1, 3 + 2), 1e-10)),
    c(rep(1, 3 + 2), Inf)
  )
  curr_qmle <- qmle(data, theta_estimate, 3, 2)
  cat(epochs_num, curr_qmle, theta_estimate, "\n")
  qmle_path <- c(qmle_path, curr_qmle)
  epochs_num <- epochs_num + 1
}

Notes

The evaluation of this vignette is set to be FALSE.

Appendix: all code snippets

knitr::opts_chunk$set(collapse = TRUE, comment = "#>", eval = FALSE)
library(fastcpd)
qmle <- function(data, theta, p = 1, q = 1) {
  if (nrow(data) < max(p, q) + 1) {
    return(0)
  }
  variance_term <- rep(0, nrow(data))
  for (i in (max(p, q) + 1):nrow(data)) {
    variance_term[i] <-
      data[i] -
      theta[1:p] %*% data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% variance_term[(i - 1):(i - q)]
  }
  (log(2 * pi) + log(theta[p + q + 1])) * (nrow(data) - q) / 2 +
    sum(variance_term^2) / (2 * theta[p + q + 1])
}
qmle_gradient <- function(data, theta, p = 1, q = 1) {
  if (nrow(data) < max(p, q) + 1) {
    return(rep(1, length(theta)))
  }
  variance_term <- rep(0, nrow(data))
  for (i in (max(p, q) + 1):nrow(data)) {
    variance_term[i] <-
      data[i] -
      theta[1:p] %*% data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% variance_term[(i - 1):(i - q)]
  }
  phi_coefficient <- matrix(0, nrow(data), p)
  psi_coefficient <- matrix(0, nrow(data), q)
  for (i in (max(p, q) + 1):nrow(data)) {
    phi_coefficient[i, ] <-
      -data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% phi_coefficient[(i - 1):(i - q), ]
  }
  for (i in (q + 1):nrow(data)) {
    psi_coefficient[i, ] <-
      -variance_term[(i - 1):(i - q)] -
      theta[(p + 1):(p + q)] %*% psi_coefficient[(i - 1):(i - q), ]
  }
  c(
    phi_coefficient[nrow(data), ] * variance_term[nrow(data)] /
      theta[p + q + 1],
    psi_coefficient[nrow(data), ] * variance_term[nrow(data)] /
      theta[p + q + 1],
    1 / 2 / theta[p + q + 1] -
      variance_term[nrow(data)]^2 / (2 * theta[p + q + 1]^2)
  )
}
qmle_gradient_sum <- function(data, theta, p = 1, q = 1) {
  if (nrow(data) < max(p, q) + 1) {
    return(rep(1, length(theta)))
  }
  variance_term <- rep(0, nrow(data))
  for (i in (max(p, q) + 1):nrow(data)) {
    variance_term[i] <-
      data[i] -
      theta[1:p] %*% data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% variance_term[(i - 1):(i - q)]
  }
  phi_coefficient <- matrix(0, nrow(data), p)
  psi_coefficient <- matrix(0, nrow(data), q)
  for (i in (max(p, q) + 1):nrow(data)) {
    phi_coefficient[i, ] <-
      -data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% phi_coefficient[(i - 1):(i - q), ]
  }
  for (i in (q + 1):nrow(data)) {
    psi_coefficient[i, ] <-
      -variance_term[(i - 1):(i - q)] -
      theta[(p + 1):(p + q)] %*% psi_coefficient[(i - 1):(i - q), ]
  }
  c(
    crossprod(phi_coefficient, variance_term) / theta[p + q + 1],
    crossprod(psi_coefficient, variance_term) / theta[p + q + 1],
    (nrow(data) - q) / 2 / theta[p + q + 1] -
      crossprod(variance_term) / 2 / theta[p + q + 1]^2
  )
}
qmle_hessian <- function(data, theta, p = 1, q = 1) {
  if (nrow(data) < max(p, q) + 1) {
    return(diag(length(theta)))
  }
  variance_term <- rep(0, nrow(data))
  for (i in (max(p, q) + 1):nrow(data)) {
    variance_term[i] <-
      data[i] -
      theta[1:p] %*% data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% variance_term[(i - 1):(i - q)]
  }
  phi_coefficient <- matrix(0, nrow(data), p)
  psi_coefficient <- matrix(0, nrow(data), q)
  for (i in (max(p, q) + 1):nrow(data)) {
    phi_coefficient[i, ] <-
      -data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% phi_coefficient[(i - 1):(i - q), ]
  }
  for (i in (q + 1):nrow(data)) {
    psi_coefficient[i, ] <-
      -variance_term[(i - 1):(i - q)] -
      theta[(p + 1):(p + q)] %*% psi_coefficient[(i - 1):(i - q), ]
  }
  phi_psi_coefficient <- array(0, c(q, p, nrow(data)))
  psi_psi_coefficient <- array(0, c(q, q, nrow(data)))
  for (i in (q + 1):nrow(data)) {
    phi_psi_coefficient[, , i] <-
      -phi_coefficient[(i - 1):(i - q), ] -
      rowSums(
        sweep(
          phi_psi_coefficient[, , (i - 1):(i - q), drop = FALSE],
          3,
          theta[(p + 1):(p + q)],
          `*`
        ),
        dims = 2
      )
    psi_psi_coefficient[, , i] <-
      -psi_coefficient[(i - 1):(i - q), ] -
      t(psi_coefficient[(i - 1):(i - q), ]) -
      rowSums(
        sweep(
          psi_psi_coefficient[, , (i - 1):(i - q), drop = FALSE],
          3,
          theta[(p + 1):(p + q)],
          `*`
        ),
        dims = 2
      )
  }
  hessian <- matrix(0, nrow = p + q + 1, ncol = p + q + 1)
  hessian[1:p, 1:p] <-
    crossprod(phi_coefficient[nrow(data), , drop = FALSE]) /
    theta[p + q + 1]
  hessian[1:p, (p + 1):(p + q)] <- (
    t(phi_psi_coefficient[, , nrow(data)]) * variance_term[nrow(data)] +
      crossprod(
        phi_coefficient[nrow(data), , drop = FALSE],
        psi_coefficient[nrow(data), , drop = FALSE]
      )
  ) / theta[p + q + 1]
  hessian[(p + 1):(p + q), 1:p] <- t(hessian[1:p, (p + 1):(p + q)])
  hessian[1:p, p + q + 1] <-
    -t(phi_coefficient[nrow(data), ]) *
    variance_term[nrow(data)] / theta[p + q + 1]^2
  hessian[p + q + 1, 1:p] <- t(hessian[1:p, p + q + 1])
  hessian[(p + 1):(p + q), (p + 1):(p + q)] <- (
    crossprod(psi_coefficient[nrow(data), , drop = FALSE]) +
      psi_psi_coefficient[, , nrow(data)] * variance_term[nrow(data)]
  ) / theta[p + q + 1]
  hessian[(p + 1):(p + q), p + q + 1] <-
    -t(psi_coefficient[nrow(data), ]) *
    variance_term[nrow(data)] / theta[p + q + 1]^2
  hessian[p + q + 1, (p + 1):(p + q)] <-
    t(hessian[(p + 1):(p + q), p + q + 1])
  hessian[p + q + 1, p + q + 1] <-
    variance_term[nrow(data)]^2 / theta[p + q + 1]^3 -
    1 / 2 / theta[p + q + 1]^2
  hessian
}
qmle_hessian_sum <- function(data, theta, p = 1, q = 1) {
  if (nrow(data) < max(p, q) + 1) {
    return(diag(length(theta)))
  }
  variance_term <- rep(0, nrow(data))
  for (i in (max(p, q) + 1):nrow(data)) {
    variance_term[i] <-
      data[i] -
      theta[1:p] %*% data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% variance_term[(i - 1):(i - q)]
  }
  phi_coefficient <- matrix(0, nrow(data), p)
  psi_coefficient <- matrix(0, nrow(data), q)
  for (i in (max(p, q) + 1):nrow(data)) {
    phi_coefficient[i, ] <-
      -data[(i - 1):(i - p)] -
      theta[(p + 1):(p + q)] %*% phi_coefficient[(i - 1):(i - q), ]
  }
  for (i in (q + 1):nrow(data)) {
    psi_coefficient[i, ] <-
      -variance_term[(i - 1):(i - q)] -
      theta[(p + 1):(p + q)] %*% psi_coefficient[(i - 1):(i - q), ]
  }
  phi_psi_coefficient <- array(0, c(q, p, nrow(data)))
  psi_psi_coefficient <- array(0, c(q, q, nrow(data)))
  for (i in (q + 1):nrow(data)) {
    phi_psi_coefficient[, , i] <-
      -phi_coefficient[(i - 1):(i - q), ] -
      rowSums(
        sweep(
          phi_psi_coefficient[, , (i - 1):(i - q), drop = FALSE],
          3,
          theta[(p + 1):(p + q)],
          `*`
        ),
        dims = 2
      )
    psi_psi_coefficient[, , i] <-
      -psi_coefficient[(i - 1):(i - q), ] -
      t(psi_coefficient[(i - 1):(i - q), ]) -
      rowSums(
        sweep(
          psi_psi_coefficient[, , (i - 1):(i - q), drop = FALSE],
          3,
          theta[(p + 1):(p + q)],
          `*`
        ),
        dims = 2
      )
  }
  hessian <- matrix(0, nrow = p + q + 1, ncol = p + q + 1)
  hessian[1:p, 1:p] <-
    crossprod(phi_coefficient) / theta[p + q + 1]
  hessian[(p + 1):(p + q), 1:p] <- (
    rowSums(
      sweep(
        phi_psi_coefficient,
        3,
        variance_term,
        `*`
      ),
      dims = 2
    ) +
      crossprod(
        psi_coefficient, phi_coefficient
      )
  ) / theta[p + q + 1]
  hessian[1:p, (p + 1):(p + q)] <- t(hessian[(p + 1):(p + q), 1:p])
  hessian[1:p, p + q + 1] <-
    -crossprod(phi_coefficient, variance_term) / theta[p + q + 1]^2
  hessian[p + q + 1, 1:p] <- t(hessian[1:p, p + q + 1])
  hessian[(p + 1):(p + q), (p + 1):(p + q)] <- (
    crossprod(psi_coefficient) + rowSums(
      sweep(
        psi_psi_coefficient,
        3,
        variance_term,
        `*`
      ),
      dims = 2
    )
  ) / theta[p + q + 1]
  hessian[(p + 1):(p + q), p + q + 1] <-
    -crossprod(psi_coefficient, variance_term) / theta[p + q + 1]^2
  hessian[p + q + 1, (p + 1):(p + q)] <-
    t(hessian[(p + 1):(p + q), p + q + 1])
  hessian[p + q + 1, p + q + 1] <-
    crossprod(variance_term) / theta[p + q + 1]^3 -
    (nrow(data) - q) / 2 / theta[p + q + 1]^2
  hessian
}

# fastcpd arma 1 1
set.seed(1)
n <- 600
w <- rnorm(n + 1, 0, 1)
x <- rep(0, n + 1)
for (i in 1:300) {
  x[i + 1] <- 0.1 * x[i] + w[i + 1] + 0.1 * w[i]
}
for (i in 301:n) {
  x[i + 1] <- 0.3 * x[i] + w[i + 1] + 0.4 * w[i]
}
result <- fastcpd(
  formula = ~ . - 1,
  data = data.frame(x = x[1 + seq_len(n)]),
  trim = 0,
  p = 1 + 1 + 1,
  beta = (1 + 1 + 1 + 1) * log(n) / 2,
  cost = qmle,
  cost_gradient = qmle_gradient,
  cost_hessian = qmle_hessian,
  cp_only = TRUE,
  lower = c(rep(-1, 1 + 1), 1e-10),
  upper = c(rep(1, 1 + 1), Inf),
  line_search = c(1, 0.1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9)
)

# fastcpd arma 3 2
set.seed(1)
n <- 600
w <- rnorm(n + 2, 0, 1)
x <- rep(0, n + 3)
for (i in 1:300) {
  x[i + 3] <- 0.1 * x[i + 2] - 0.2 * x[i + 1] + 0.6 * x[i] +
    w[i + 2] + 0.1 * w[i + 1] + 0.5 * w[i]
}
for (i in 301:n) {
  x[i + 3] <- 0.3 * x[i + 2] + 0.4 * x[i + 1] + 0.2 * x[i] +
    w[i + 2] + 0.4 * w[i + 1] + 0.1 * w[i]
}
result <- fastcpd(
  formula = ~ . - 1,
  data = data.frame(x = x[3 + seq_len(n)]),
  trim = 0,
  p = 3 + 2 + 1,
  beta = (3 + 2 + 1 + 1) * log(n) / 2,
  cost = function(data, theta) {
    qmle(data, theta, 3, 2)
  },
  cost_gradient = function(data, theta) {
    qmle_gradient(data, theta, 3, 2)
  },
  cost_hessian = function(data, theta) {
    qmle_hessian(data, theta, 3, 2)
  },
  cp_only = TRUE,
  lower = c(rep(-1, 3 + 2), 1e-10),
  upper = c(rep(1, 3 + 2), Inf),
  line_search = c(1, 0.1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9)
)
testthat::expect_equal(result@cp_set, c(4, 22, 290))

# hessian check
theta_estimate <- rep(0.1, 3 + 2 + 1)
testthat::expect_equal(
  numDeriv::hessian(
    qmle, theta_estimate, data = matrix(x[3 + seq_len(n)]), p = 3, q = 2
  ),
  qmle_hessian_sum(matrix(x[3 + seq_len(n)]), theta_estimate, 3, 2)
)

optim(
  rep(0.1, 3 + 2 + 1),
  fn = function(data, theta) {
    qmle(data, theta, 3, 2)
  },
  data = data,
  method = "L-BFGS-B",
  lower = c(rep(-1, 3 + 2), 1e-10),
  upper = c(rep(1, 3 + 2), Inf),
  gr = function(data, theta) {
    qmle_gradient_sum(data, theta, 3, 2)
  }
)

# convergence check
x <- arima.sim(list(ar = c(0.1, -0.2, 0.6), ma = c(0.1, 0.5)), n = n + 3)
theta_estimate <- rep(0.1, 3 + 2 + 1)
data <- matrix(x[3 + seq_len(n)])
qmle_path <- NULL
prev_qmle <- 1
curr_qmle <- Inf
epochs_num <- 0
while (abs(curr_qmle - prev_qmle) > 1e-5) {
  prev_qmle <- curr_qmle
  hessian <-
    Matrix::nearPD(qmle_hessian_sum(data, theta_estimate, 3, 2))$mat
  step <- solve(
    hessian, qmle_gradient_sum(data, theta_estimate, 3, 2)
  )
  # line search
  lr_choices <- c(1, 0.1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9)
  lr <- lr_choices[which.min(
    sapply(lr_choices, function(lr) {
      qmle(data, pmin(
        pmax(theta_estimate - lr * step, c(rep(-1, 3 + 2), 1e-10)),
        c(rep(1, 3 + 2), Inf)
      ), 3, 2)
    })
  )]
  theta_estimate <- pmin(
    pmax(theta_estimate - lr * step, c(rep(-1, 3 + 2), 1e-10)),
    c(rep(1, 3 + 2), Inf)
  )
  curr_qmle <- qmle(data, theta_estimate, 3, 2)
  cat(epochs_num, curr_qmle, theta_estimate, "\n")
  qmle_path <- c(qmle_path, curr_qmle)
  epochs_num <- epochs_num + 1
}