fastcpd.segmentation
Perform change point detection using fastcpd.
"""
Perform change point detection using fastcpd.
"""

# import pandas as pd
import numpy
# from patsy import dmatrices

import fastcpd.variance_estimation
from math import log, log2
from fastcpd.interface import fastcpd_impl


def mean(data, **kwargs):
    """Detect change points under a mean change model.

    Thin convenience wrapper: forwards to ``detect()`` with
    ``family='mean'``.

    Args:
        data: Univariate or multivariate data for mean change detection.
        **kwargs: Additional arguments passed to ``detect()``.

    Returns:
        A ``FastCPDResult`` object.
    """
    result = detect(family='mean', data=data, **kwargs)
    return result


def variance(data, **kwargs):
    """Detect change points under a variance change model.

    Thin convenience wrapper: forwards to ``detect()`` with
    ``family='variance'``.

    Args:
        data: Univariate or multivariate data for variance change detection.
        **kwargs: Additional arguments passed to ``detect()``.

    Returns:
        A ``FastCPDResult`` object.
    """
    result = detect(family='variance', data=data, **kwargs)
    return result


def meanvariance(data, **kwargs):
    """Detect change points under a mean and/or variance change model.

    Thin convenience wrapper: forwards to ``detect()`` with
    ``family='meanvariance'``.

    Args:
        data: Univariate or multivariate data for mean and/or variance change
            detection.
        **kwargs: Additional arguments passed to ``detect()``.

    Returns:
        A ``FastCPDResult`` object.
    """
    result = detect(family='meanvariance', data=data, **kwargs)
    return result


def detect(
    formula: str = 'y ~ . - 1',
    data: numpy.ndarray = None,
    beta: object = 'MBIC',
    cost_adjustment: str = 'MBIC',
    family: str = None,
    cost=None,
    cost_gradient=None,
    cost_hessian=None,
    line_search=(1,),
    lower=None,
    upper=None,
    pruning_coef: float = None,
    segment_count: int = 10,
    trim: float = 0.05,
    momentum_coef: float = 0,
    multiple_epochs=lambda x: 0,
    epsilon: float = 1e-10,
    order=(0, 0, 0),
    p: int = None,
    variance_estimation=None,
    cp_only: bool = False,
    vanilla_percentage: float = 0,
    warm_start: bool = False,
    **kwargs
):
    r"""Find change points efficiently.

    Validates the arguments, derives the parameter count, pruning
    coefficient and penalty from the data, and delegates the actual
    segmentation to ``fastcpd_impl``.

    Note:
        ``formula``, ``cost``, ``cost_gradient``, ``cost_hessian``, ``lower``,
        ``upper``, ``multiple_epochs``, ``vanilla_percentage`` and
        ``**kwargs`` are accepted for interface compatibility but are not
        read by this implementation.

    Args:
        formula: A formula string specifying the model to be fitted. The
            optional response variable should be on the LHS, covariates on the
            RHS. Intercept should be removed by appending '- 1'. By default,
            an intercept column is added as in R's lm().
        data: Data to be segmented, of shape (T, d); each row is a data point
            $z_t$ in $\mathbb{R}^d$. A one-dimensional input is treated as
            univariate data and reshaped to (T, 1). Required.
        beta: Penalty criterion for the number of change points. Either one
            of the strings 'BIC' (``(p + 1) * log(T) / 2``), 'MBIC'
            (``(p + 2) * log(T) / 2``) or 'MDL' (``(p + 2) * log2(T) / 2``),
            or a non-string value that is passed through unchanged.
        cost_adjustment: Cost adjustment criterion modifying the cost function.
            Must be one of 'BIC', 'MBIC' or 'MDL'.
        family: Family of change point model (case-insensitive). This
            implementation accepts 'mean', 'variance' and 'meanvariance';
            ``None`` is treated as 'custom', which is rejected.
        cost: Custom cost function, e.g., ``cost(data)`` or
            ``cost(data, theta)``.
        cost_gradient: Gradient of custom cost, e.g.,
            ``cost_gradient(data, theta)``.
        cost_hessian: Hessian of custom cost, e.g.,
            ``cost_hessian(data, theta)``.
        line_search: Values for line search step sizes.
        lower: Lower bound for parameters after each update.
        upper: Upper bound for parameters after each update.
        pruning_coef: Pruning coefficient for PELT algorithm. ``None`` means
            0. The value is then increased by ``p * log(2)`` when
            ``cost_adjustment`` is 'MBIC' and by ``p * log2(2)`` when it is
            'MDL', whether or not a value was supplied.
        segment_count: Initial guess for number of segments.
        trim: Trimming proportion for boundary change points.
        momentum_coef: Momentum coefficient for parameter updates.
        multiple_epochs: A function that takes the segment length and
            returns an int for additional epochs.
        epsilon: Epsilon for numerical stability.
        order: Order for AR, VAR, ARIMA, GARCH models.
        p: Number of parameters. Any supplied value is overwritten: it is
            always derived from the column count ``d`` of ``data`` (``d`` for
            'mean', ``d ** 2`` for 'variance', ``d + d ** 2`` for
            'meanvariance').
        variance_estimation: Pre-specified variance/covariance matrix. When
            omitted and ``family`` is 'mean', it is computed with
            ``fastcpd.variance_estimation.mean(data)``.
        cp_only: If True, only change points are returned.
        vanilla_percentage: Interpolation parameter between PELT and SeGD.
        warm_start: If True, use the previous segment's parameters as
            initial values for the new segment.
        **kwargs: Additional model-specific parameters.

    Returns:
        A ``FastCPDResult`` object containing change points, costs,
        residuals, and parameter estimates.

    Raises:
        ValueError: If ``family`` or ``cost_adjustment`` is not supported, if
            ``data`` is missing, or if ``beta`` is an unknown string.
    """
    family = family.lower() if family is not None else 'custom'

    # Explicit raises instead of ``assert``: assertions are stripped under
    # ``python -O`` and would let invalid input reach the backend.
    supported_families = ('mean', 'variance', 'meanvariance')
    if family not in supported_families:
        raise ValueError(
            f"Unsupported family: {family!r}; "
            f"expected one of {supported_families}"
        )
    supported_adjustments = ('BIC', 'MBIC', 'MDL')
    if cost_adjustment not in supported_adjustments:
        raise ValueError(
            f"Unsupported cost_adjustment: {cost_adjustment!r}; "
            f"expected one of {supported_adjustments}"
        )
    if data is None:
        raise ValueError("data must be provided")

    # Univariate input may arrive as a flat sequence; the shape-based
    # computations below need an explicit column axis.
    data = numpy.asarray(data)
    if data.ndim == 1:
        data = data.reshape(-1, 1)

    if variance_estimation is not None:
        variance_estimation = numpy.asarray(variance_estimation)
    elif family == 'mean':
        variance_estimation = fastcpd.variance_estimation.mean(data)

    # Parameter count per segment, derived from the number of columns.
    dimension = data.shape[1]
    if family == 'mean':
        p = dimension
    elif family == 'variance':
        p = dimension ** 2
    else:  # 'meanvariance'
        p = dimension + dimension ** 2

    if pruning_coef is None:
        pruning_coef = 0.0
    if cost_adjustment == "MBIC":
        pruning_coef += p * log(2)
    elif cost_adjustment == "MDL":
        pruning_coef += p * log2(2)

    if isinstance(beta, str):
        sample_count = data.shape[0]
        if beta == 'BIC':
            beta = (p + 1) * log(sample_count) / 2
        elif beta == 'MBIC':
            beta = (p + 2) * log(sample_count) / 2
        elif beta == 'MDL':
            beta = (p + 2) * log2(sample_count) / 2
        else:
            raise ValueError(f"Unknown beta criterion: {beta}")

    # Positional order is dictated by ``fastcpd_impl``; the literal
    # arguments ([], 0, [], 1.0) are passed exactly as before.
    result = fastcpd_impl(
        beta,
        cost_adjustment,
        cp_only,
        data,
        epsilon,
        family,
        line_search,
        [],
        momentum_coef,
        order,
        p,
        0,
        pruning_coef,
        segment_count,
        trim,
        [],
        1.0,
        variance_estimation,
        warm_start,
    )
    return result
def
mean(data, **kwargs):
def mean(data, **kwargs):
    """Detect change points under a mean change model.

    Thin convenience wrapper: forwards to ``detect()`` with
    ``family='mean'``.

    Args:
        data: Univariate or multivariate data for mean change detection.
        **kwargs: Additional arguments passed to ``detect()``.

    Returns:
        A ``FastCPDResult`` object.
    """
    result = detect(family='mean', data=data, **kwargs)
    return result
Find change points efficiently in mean change models.
Arguments:
- data: Univariate or multivariate data for mean change detection.
- **kwargs: Additional arguments passed to detect().
Returns:
A FastCPDResult object.
def
variance(data, **kwargs):
def variance(data, **kwargs):
    """Detect change points under a variance change model.

    Thin convenience wrapper: forwards to ``detect()`` with
    ``family='variance'``.

    Args:
        data: Univariate or multivariate data for variance change detection.
        **kwargs: Additional arguments passed to ``detect()``.

    Returns:
        A ``FastCPDResult`` object.
    """
    result = detect(family='variance', data=data, **kwargs)
    return result
Find change points efficiently in variance change models.
Arguments:
- data: Univariate or multivariate data for variance change detection.
- **kwargs: Additional arguments passed to detect().
Returns:
A FastCPDResult object.
def
meanvariance(data, **kwargs):
def meanvariance(data, **kwargs):
    """Detect change points under a mean and/or variance change model.

    Thin convenience wrapper: forwards to ``detect()`` with
    ``family='meanvariance'``.

    Args:
        data: Univariate or multivariate data for mean and/or variance change
            detection.
        **kwargs: Additional arguments passed to ``detect()``.

    Returns:
        A ``FastCPDResult`` object.
    """
    result = detect(family='meanvariance', data=data, **kwargs)
    return result
Find change points efficiently in mean and/or variance change models.
Arguments:
- data: Univariate or multivariate data for mean and/or variance change detection.
- **kwargs: Additional arguments passed to detect().
Returns:
A FastCPDResult object.
def
detect( formula: str = 'y ~ . - 1', data: numpy.ndarray = None, beta: object = 'MBIC', cost_adjustment: str = 'MBIC', family: str = None, cost=None, cost_gradient=None, cost_hessian=None, line_search=(1,), lower=None, upper=None, pruning_coef: float = None, segment_count: int = 10, trim: float = 0.05, momentum_coef: float = 0, multiple_epochs=<function <lambda>>, epsilon: float = 1e-10, order=(0, 0, 0), p: int = None, variance_estimation=None, cp_only: bool = False, vanilla_percentage: float = 0, warm_start: bool = False, **kwargs):
def detect(
    formula: str = 'y ~ . - 1',
    data: numpy.ndarray = None,
    beta: object = 'MBIC',
    cost_adjustment: str = 'MBIC',
    family: str = None,
    cost=None,
    cost_gradient=None,
    cost_hessian=None,
    line_search=(1,),
    lower=None,
    upper=None,
    pruning_coef: float = None,
    segment_count: int = 10,
    trim: float = 0.05,
    momentum_coef: float = 0,
    multiple_epochs=lambda x: 0,
    epsilon: float = 1e-10,
    order=(0, 0, 0),
    p: int = None,
    variance_estimation=None,
    cp_only: bool = False,
    vanilla_percentage: float = 0,
    warm_start: bool = False,
    **kwargs
):
    r"""Find change points efficiently.

    Validates the arguments, derives the parameter count, pruning
    coefficient and penalty from the data, and delegates the actual
    segmentation to ``fastcpd_impl``.

    Note:
        ``formula``, ``cost``, ``cost_gradient``, ``cost_hessian``, ``lower``,
        ``upper``, ``multiple_epochs``, ``vanilla_percentage`` and
        ``**kwargs`` are accepted for interface compatibility but are not
        read by this implementation.

    Args:
        formula: A formula string specifying the model to be fitted. The
            optional response variable should be on the LHS, covariates on the
            RHS. Intercept should be removed by appending '- 1'. By default,
            an intercept column is added as in R's lm().
        data: Data to be segmented, of shape (T, d); each row is a data point
            $z_t$ in $\mathbb{R}^d$. A one-dimensional input is treated as
            univariate data and reshaped to (T, 1). Required.
        beta: Penalty criterion for the number of change points. Either one
            of the strings 'BIC' (``(p + 1) * log(T) / 2``), 'MBIC'
            (``(p + 2) * log(T) / 2``) or 'MDL' (``(p + 2) * log2(T) / 2``),
            or a non-string value that is passed through unchanged.
        cost_adjustment: Cost adjustment criterion modifying the cost function.
            Must be one of 'BIC', 'MBIC' or 'MDL'.
        family: Family of change point model (case-insensitive). This
            implementation accepts 'mean', 'variance' and 'meanvariance';
            ``None`` is treated as 'custom', which is rejected.
        cost: Custom cost function, e.g., ``cost(data)`` or
            ``cost(data, theta)``.
        cost_gradient: Gradient of custom cost, e.g.,
            ``cost_gradient(data, theta)``.
        cost_hessian: Hessian of custom cost, e.g.,
            ``cost_hessian(data, theta)``.
        line_search: Values for line search step sizes.
        lower: Lower bound for parameters after each update.
        upper: Upper bound for parameters after each update.
        pruning_coef: Pruning coefficient for PELT algorithm. ``None`` means
            0. The value is then increased by ``p * log(2)`` when
            ``cost_adjustment`` is 'MBIC' and by ``p * log2(2)`` when it is
            'MDL', whether or not a value was supplied.
        segment_count: Initial guess for number of segments.
        trim: Trimming proportion for boundary change points.
        momentum_coef: Momentum coefficient for parameter updates.
        multiple_epochs: A function that takes the segment length and
            returns an int for additional epochs.
        epsilon: Epsilon for numerical stability.
        order: Order for AR, VAR, ARIMA, GARCH models.
        p: Number of parameters. Any supplied value is overwritten: it is
            always derived from the column count ``d`` of ``data`` (``d`` for
            'mean', ``d ** 2`` for 'variance', ``d + d ** 2`` for
            'meanvariance').
        variance_estimation: Pre-specified variance/covariance matrix. When
            omitted and ``family`` is 'mean', it is computed with
            ``fastcpd.variance_estimation.mean(data)``.
        cp_only: If True, only change points are returned.
        vanilla_percentage: Interpolation parameter between PELT and SeGD.
        warm_start: If True, use the previous segment's parameters as
            initial values for the new segment.
        **kwargs: Additional model-specific parameters.

    Returns:
        A ``FastCPDResult`` object containing change points, costs,
        residuals, and parameter estimates.

    Raises:
        ValueError: If ``family`` or ``cost_adjustment`` is not supported, if
            ``data`` is missing, or if ``beta`` is an unknown string.
    """
    family = family.lower() if family is not None else 'custom'

    # Explicit raises instead of ``assert``: assertions are stripped under
    # ``python -O`` and would let invalid input reach the backend.
    supported_families = ('mean', 'variance', 'meanvariance')
    if family not in supported_families:
        raise ValueError(
            f"Unsupported family: {family!r}; "
            f"expected one of {supported_families}"
        )
    supported_adjustments = ('BIC', 'MBIC', 'MDL')
    if cost_adjustment not in supported_adjustments:
        raise ValueError(
            f"Unsupported cost_adjustment: {cost_adjustment!r}; "
            f"expected one of {supported_adjustments}"
        )
    if data is None:
        raise ValueError("data must be provided")

    # Univariate input may arrive as a flat sequence; the shape-based
    # computations below need an explicit column axis.
    data = numpy.asarray(data)
    if data.ndim == 1:
        data = data.reshape(-1, 1)

    if variance_estimation is not None:
        variance_estimation = numpy.asarray(variance_estimation)
    elif family == 'mean':
        variance_estimation = fastcpd.variance_estimation.mean(data)

    # Parameter count per segment, derived from the number of columns.
    dimension = data.shape[1]
    if family == 'mean':
        p = dimension
    elif family == 'variance':
        p = dimension ** 2
    else:  # 'meanvariance'
        p = dimension + dimension ** 2

    if pruning_coef is None:
        pruning_coef = 0.0
    if cost_adjustment == "MBIC":
        pruning_coef += p * log(2)
    elif cost_adjustment == "MDL":
        pruning_coef += p * log2(2)

    if isinstance(beta, str):
        sample_count = data.shape[0]
        if beta == 'BIC':
            beta = (p + 1) * log(sample_count) / 2
        elif beta == 'MBIC':
            beta = (p + 2) * log(sample_count) / 2
        elif beta == 'MDL':
            beta = (p + 2) * log2(sample_count) / 2
        else:
            raise ValueError(f"Unknown beta criterion: {beta}")

    # Positional order is dictated by ``fastcpd_impl``; the literal
    # arguments ([], 0, [], 1.0) are passed exactly as before.
    result = fastcpd_impl(
        beta,
        cost_adjustment,
        cp_only,
        data,
        epsilon,
        family,
        line_search,
        [],
        momentum_coef,
        order,
        p,
        0,
        pruning_coef,
        segment_count,
        trim,
        [],
        1.0,
        variance_estimation,
        warm_start,
    )
    return result
Find change points efficiently.
Arguments:
- formula: A formula string specifying the model to be fitted. The optional response variable should be on the LHS, covariates on the RHS. Intercept should be removed by appending '- 1'. By default, an intercept column is added as in R's lm().
- data: A NumPy array of shape (T, d) containing the data to be segmented. Each row is a data point $z_t$ in $\mathbb{R}^d$.
- beta: Penalty criterion for the number of change points. Can be one of 'BIC', 'MBIC', 'MDL', or a float value.
- cost_adjustment: Cost adjustment criterion modifying the cost function. Can be one of 'BIC', 'MBIC', 'MDL', or None.
- family: Family of change point model. One of: 'mean', 'variance', 'meanvariance', 'lm', 'binomial', 'poisson', 'lasso', 'ar', 'arma', 'arima', 'garch', 'var', 'custom'. If None, it is treated as 'custom'.
- cost: Custom cost function, e.g., cost(data) or cost(data, theta).
- cost_gradient: Gradient of custom cost, e.g., cost_gradient(data, theta).
- cost_hessian: Hessian of custom cost, e.g., cost_hessian(data, theta).
- line_search: Values for line search step sizes.
- lower: Lower bound for parameters after each update.
- upper: Upper bound for parameters after each update.
- pruning_coef: Pruning coefficient for PELT algorithm.
- segment_count: Initial guess for number of segments.
- trim: Trimming proportion for boundary change points.
- momentum_coef: Momentum coefficient for parameter updates.
- multiple_epochs: A function that takes the segment length and returns an int for additional epochs.
- epsilon: Epsilon for numerical stability.
- order: Order for AR, VAR, ARIMA, GARCH models.
- p: Number of covariates. If None, inferred from data.
- variance_estimation: Pre-specified variance/covariance matrix.
- cp_only: If True, only change points are returned.
- vanilla_percentage: Interpolation parameter between PELT and SeGD.
- warm_start: If True, use the previous segment's parameters as initial values for the new segment.
- **kwargs: Additional model-specific parameters.
Returns:
A FastCPDResult object containing change points, costs, residuals, and parameter estimates.