linear regression with multi-dimensional responses
set.seed(1)
n <- 300
p <- 3
y_count <- 2
x <- mvtnorm::rmvnorm(n, rep(0, p), diag(p))
theta_0 <- array(NA, dim = c(3, y_count, 3))
theta_0[, , 1] <- cbind(c(1, 1.2, -1), c(-1, 0, 0.5))
theta_0[, , 2] <- cbind(c(-1, 0, 0.5), c(0.5, -0.3, 0.2))
theta_0[, , 3] <- cbind(c(0.5, -0.3, 0.2), c(1, 1.2, -1))
y <- rbind(
x[1:100, ] %*% theta_0[, , 1],
x[101:200, ] %*% theta_0[, , 2],
x[201:n, ] %*% theta_0[, , 3]
) + matrix(rnorm(n * y_count), ncol = y_count)
multi_response_linear_loss <- function(data) {
x <- data[, (ncol(data) - p + 1):ncol(data)]
y <- data[, 1:(ncol(data) - p)]
if (nrow(data) <= p) {
x_t_x <- diag(p)
} else {
x_t_x <- crossprod(x)
}
norm(y - x %*% solve(x_t_x, t(x)) %*% y, type = "F")^2 / 2
}
result <- fastcpd(
formula = y ~ x - 1,
data = data.frame(y = y, x = x),
beta = (2 * p + 1) * log(n) / 2,
cost = multi_response_linear_loss,
cp_only = TRUE
)
testthat::expect_equal(result@cp_set, c(102, 195))
var(2) model with p = 2 with innovation sd = 1
set.seed(1)
n <- 800
p <- 2
theta_1 <- matrix(c(-0.3, 0.6, -0.5, 0.4, 0.2, 0.2, 0.2, -0.2), nrow = p)
theta_2 <- matrix(c(0.3, -0.4, 0.1, -0.5, -0.5, -0.2, -0.5, 0.2), nrow = p)
x <- matrix(0, n + 2, p)
for (i in 1:500) {
x[i + 2, ] <- theta_1 %*% c(x[i + 1, ], x[i, ]) + rnorm(p, 0, 1)
}
for (i in 501:800) {
x[i + 2, ] <- theta_2 %*% c(x[i + 1, ], x[i, ]) + rnorm(p, 0, 1)
}
data <- matrix(NA, n, p * 3)
data[, seq_len(p)] <- x[p + seq_len(n), ]
for (p_i in seq_len(p)) {
data[, p + (p * p_i - 1):(p * p_i)] <- x[p - p_i + seq_len(n), ]
}
lm_x_col <- 4
multi_response_linear_loss <- function(data) {
x <- data[, (ncol(data) - lm_x_col + 1):ncol(data)]
y <- data[, 1:(ncol(data) - lm_x_col)]
if (nrow(data) <= lm_x_col + 1) {
x_t_x <- diag(lm_x_col)
} else {
x_t_x <- crossprod(x)
}
norm(y - x %*% solve(x_t_x, t(x)) %*% y, type = "F")^2 / 2
}
result_custom <- fastcpd(
formula = cbind(y.1, y.2) ~ x.1 + x.2 + x.3 + x.4 - 1,
data = data.frame(y = data[, 1:2], x = data[, 3:6]),
beta = (2 * 4 + 1) * log(n) / 2,
cost = multi_response_linear_loss
)
testthat::expect_equal(result_custom@cp_set, 517)
result_ts <- fastcpd_ts(x, "var", 2)
testthat::expect_equal(result_ts@cp_set, 519)
result_var <- fastcpd(
formula = ~ . - 1,
data = data.frame(x = x),
family = "var",
order = 2
)
testthat::expect_equal(result_var@cp_set, 519)
custom family
variance change in mean & variance change
set.seed(1)
p <- 3
data <- rbind.data.frame(
mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)),
mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)),
mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(50, p)),
mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)),
mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)),
mvtnorm::rmvnorm(300, mean = rep(10, p), sigma = diag(50, p))
)
data_all_mean <- colMeans(data)
var_loss <- function(data) {
n <- nrow(data)
p <- ncol(data)
if (n < p) {
data_cov <- diag(p)
} else {
data_cov <- crossprod(sweep(data, 2, data_all_mean)) / (n - 1)
}
n / 2 * (log(det(data_cov)) + p * log(2 * pi) + p * (n - 1) / n)
}
var_loss_result <- fastcpd(
formula = ~ . - 1,
data = data,
beta = (p^2 + 1) * log(nrow(data)) / 2,
trim = 0.1,
p = p^2,
cost = var_loss
)
testthat::expect_equal(var_loss_result@cp_set, c(700, 1000, 1700))
huber regression
set.seed(1)
n <- 400 + 300 + 500
p <- 5
x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(p))
theta <- rbind(
mvtnorm::rmvnorm(1, mean = rep(0, p - 3), sigma = diag(p - 3)),
mvtnorm::rmvnorm(1, mean = rep(5, p - 3), sigma = diag(p - 3)),
mvtnorm::rmvnorm(1, mean = rep(9, p - 3), sigma = diag(p - 3))
)
theta <- cbind(theta, matrix(0, 3, 3))
theta <- theta[rep(seq_len(3), c(400, 300, 500)), ]
y_true <- rowSums(x * theta)
factor <- c(
2 * stats::rbinom(400, size = 1, prob = 0.95) - 1,
2 * stats::rbinom(300, size = 1, prob = 0.95) - 1,
2 * stats::rbinom(500, size = 1, prob = 0.95) - 1
)
y <- factor * y_true + stats::rnorm(n)
data <- cbind.data.frame(y, x)
huber_threshold <- 1
huber_loss <- function(data, theta) {
residual <- data[, 1] - data[, -1, drop = FALSE] %*% theta
indicator <- abs(residual) <= huber_threshold
sum(
residual^2 / 2 * indicator +
huber_threshold * (
abs(residual) - huber_threshold / 2
) * (1 - indicator)
)
}
huber_loss_gradient <- function(data, theta) {
residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta)
if (abs(residual) <= huber_threshold) {
-residual * data[nrow(data), -1]
} else {
-huber_threshold * sign(residual) * data[nrow(data), -1]
}
}
huber_loss_hessian <- function(data, theta) {
residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta)
if (abs(residual) <= huber_threshold) {
outer(data[nrow(data), -1], data[nrow(data), -1])
} else {
0.01 * diag(length(theta))
}
}
huber_regression_result <- fastcpd(
formula = y ~ . - 1,
data = data,
beta = (p + 1) * log(n) / 2,
cost = huber_loss,
cost_gradient = huber_loss_gradient,
cost_hessian = huber_loss_hessian
)
testthat::expect_equal(huber_regression_result@cp_set, c(401, 726))
huber regression starting with vanilla
set.seed(1)
n <- 400 + 300 + 500
p <- 5
x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(p))
theta <- rbind(
mvtnorm::rmvnorm(1, mean = rep(0, p - 3), sigma = diag(p - 3)),
mvtnorm::rmvnorm(1, mean = rep(5, p - 3), sigma = diag(p - 3)),
mvtnorm::rmvnorm(1, mean = rep(9, p - 3), sigma = diag(p - 3))
)
theta <- cbind(theta, matrix(0, 3, 3))
theta <- theta[rep(seq_len(3), c(400, 300, 500)), ]
y_true <- rowSums(x * theta)
factor <- c(
2 * stats::rbinom(400, size = 1, prob = 0.95) - 1,
2 * stats::rbinom(300, size = 1, prob = 0.95) - 1,
2 * stats::rbinom(500, size = 1, prob = 0.95) - 1
)
y <- factor * y_true + stats::rnorm(n)
data <- cbind.data.frame(y, x)
huber_threshold <- 1
huber_loss <- function(data, theta) {
residual <- data[, 1] - data[, -1, drop = FALSE] %*% theta
indicator <- abs(residual) <= huber_threshold
sum(
residual^2 / 2 * indicator +
huber_threshold * (
abs(residual) - huber_threshold / 2
) * (1 - indicator)
)
}
huber_loss_gradient <- function(data, theta) {
residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta)
if (abs(residual) <= huber_threshold) {
-residual * data[nrow(data), -1]
} else {
-huber_threshold * sign(residual) * data[nrow(data), -1]
}
}
huber_loss_hessian <- function(data, theta) {
residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta)
if (abs(residual) <= huber_threshold) {
outer(data[nrow(data), -1], data[nrow(data), -1])
} else {
0.01 * diag(length(theta))
}
}
huber_regression_result <- fastcpd(
formula = y ~ . - 1,
data = data,
beta = (p + 1) * log(n) / 2,
cost = huber_loss,
cost_gradient = huber_loss_gradient,
cost_hessian = huber_loss_hessian,
vanilla_percentage = 0.1
)
testthat::expect_equal(huber_regression_result@cp_set, c(401, 726))