fastcpd.segmentation
Perform change point detection using fastcpd.
1""" 2Perform change point detection using fastcpd. 3""" 4 5import collections 6import numpy 7import fastcpd.variance_estimation 8from fastcpd.interface import fastcpd_impl 9 10# Families supported by the C++ Python binding (NO_RCPP mode). 11# 12# PELT families (fully supported): mean, variance, meanvariance, mgaussian. 13# SEN families (OLS-initialised, no glmnet/arima): lasso. 14# 15# Not yet supported (require R packages at runtime): 16# arma, arima, ma — need stats::arima() 17# garch — needs tseries::garch() + Fortran 18# gaussian/lm, binomial, poisson — need glmnet / fastglm (Rcpp + Eigen) 19_SUPPORTED_FAMILIES = frozenset({ 20 'mean', 'variance', 'meanvariance', 'mgaussian', 'lasso', 21}) 22 23# Map R-style alias names to the internal C++ family string. 24_FAMILY_ALIASES = { 25 'var': 'mgaussian', 26} 27 28# Result object returned by detect() when cp_only=False. 29# Fields: 30# cp_set – list of change-point indices (1-based, matching R package) 31# raw_cp_set – raw change-point indices before boundary trimming 32# cost_values – list of segment cost values (one per segment) 33# residuals – nested list of shape (n_obs, n_response) 34# thetas – nested list of shape (n_params, n_segments); column j holds 35# the estimated parameters for segment j 36CpdResult = collections.namedtuple( 37 'CpdResult', 38 ['cp_set', 'raw_cp_set', 'cost_values', 'residuals', 'thetas'], 39) 40 41 42def mean(data, **kwargs): 43 """Find change points efficiently in mean change models. 44 45 Args: 46 data: Univariate or multivariate data for mean change detection. 47 **kwargs: Additional arguments passed to ``detect()``. 48 49 Returns: 50 A list of change-point indices, or a CpdResult when cp_only=False. 51 """ 52 return detect(data=data, family='mean', **kwargs) 53 54 55def variance(data, **kwargs): 56 """Find change points efficiently in variance change models. 57 58 Args: 59 data: Univariate or multivariate data for variance change detection. 60 **kwargs: Additional arguments passed to ``detect()``. 61 62 Returns: 63 A list of change-point indices, or a CpdResult when cp_only=False. 64 """ 65 return detect(data=data, family='variance', **kwargs) 66 67 68def meanvariance(data, **kwargs): 69 """Find change points efficiently in mean and/or variance change models. 70 71 Args: 72 data: Univariate or multivariate data for mean and/or variance change 73 detection. 74 **kwargs: Additional arguments passed to ``detect()``. 75 76 Returns: 77 A list of change-point indices, or a CpdResult when cp_only=False. 78 """ 79 return detect(data=data, family='meanvariance', **kwargs) 80 81 82def var(data, order, **kwargs): 83 """Find change points efficiently in VAR (vector autoregression) models. 84 85 Args: 86 data: Multivariate time series data, shape (n, q+p) where q columns 87 are responses and p columns are lagged predictors. 88 order: Number of lagged predictors per response (p). 89 **kwargs: Additional arguments passed to ``detect()``. 90 91 Returns: 92 A list of change-point indices, or a CpdResult when cp_only=False. 93 """ 94 return detect(data=data, family='mgaussian', **kwargs) 95 96 97def arma(data, order=(0, 0), **kwargs): 98 """Find change points efficiently in ARMA models. 99 100 Args: 101 data: Univariate time series data. 102 order: Tuple (p, q) for AR and MA orders. 103 **kwargs: Additional arguments passed to ``detect()``. 104 105 Returns: 106 A list of change-point indices, or a CpdResult when cp_only=False. 107 """ 108 return detect(data=data, family='arma', order=order, **kwargs) 109 110 111def lasso(data, **kwargs): 112 """Find change points efficiently in LASSO regression models. 113 114 Args: 115 data: Data where the first column is the response. 116 **kwargs: Additional arguments passed to ``detect()``. 117 118 Returns: 119 A list of change-point indices, or a CpdResult when cp_only=False. 120 """ 121 return detect(data=data, family='lasso', **kwargs) 122 123 124def ma(data, order=0, **kwargs): 125 """Find change points efficiently in MA (moving average) models. 126 127 Args: 128 data: Univariate time series data. 129 order: MA order q. 130 **kwargs: Additional arguments passed to ``detect()``. 131 132 Returns: 133 A list of change-point indices, or a CpdResult when cp_only=False. 134 """ 135 return detect(data=data, family='ma', order=(0, order), **kwargs) 136 137 138def detect( 139 formula: str = 'y ~ . - 1', 140 data: numpy.ndarray = None, 141 beta='MBIC', 142 cost_adjustment: str = 'MBIC', 143 family: str = None, 144 cost=None, 145 cost_gradient=None, 146 cost_hessian=None, 147 line_search=(1,), 148 lower=None, 149 upper=None, 150 pruning_coef=None, 151 segment_count: int = 10, 152 trim: float = 0.05, 153 momentum_coef: float = 0.0, 154 multiple_epochs=lambda x: 0, 155 epsilon: float = 1e-10, 156 order=(0, 0, 0), 157 p: int = None, 158 p_response: int = 0, 159 variance_estimation=None, 160 cp_only: bool = False, 161 vanilla_percentage: float = 1.0, 162 warm_start: bool = False, 163 **kwargs 164): 165 r"""Find change points efficiently. 166 167 Args: 168 formula: A formula string (unused; present for API parity with R). 169 data: A NumPy array of shape (n, d) containing the data. 170 beta: Penalty criterion. One of 'BIC', 'MBIC', 'MDL', or a float. 171 The numeric value of the penalty is computed by C++ when a string 172 is supplied, using the same formulae as the R package. 173 cost_adjustment: One of 'BIC', 'MBIC', 'MDL'. 174 family: One of 'mean', 'variance', 'meanvariance', 'mgaussian', 175 'var' (alias for 'mgaussian'), 'lasso'. 176 line_search: Values for line search step sizes. 177 lower: Lower bound for parameters after each update. 178 upper: Upper bound for parameters after each update. 179 pruning_coef: Base pruning coefficient for the PELT algorithm. 180 ``None`` (default) lets C++ compute the appropriate value 181 automatically based on ``cost_adjustment`` and ``family``, 182 matching R's ``get_pruning_coef()`` behaviour. 183 segment_count: Initial guess for number of segments. 184 trim: Trimming proportion for boundary change points. 185 momentum_coef: Momentum coefficient for parameter updates. 186 epsilon: Epsilon for numerical stability. 187 order: Order for ARMA/MA models as tuple (ar_order, ma_order). 188 p: Number of model parameters. ``None`` (or 0) triggers automatic 189 inference from ``family`` and the data dimensions in C++, 190 matching the R package's per-family formulas. 191 p_response: Number of response columns (mgaussian only). 192 variance_estimation: Pre-specified variance/covariance matrix. 193 When not supplied, estimated automatically (Rice estimator for 194 mean/mgaussian, identity otherwise). 195 cp_only: If True, return only change-point indices (list of floats). 196 If False, return a CpdResult namedtuple with cp_set, raw_cp_set, 197 cost_values, residuals, and thetas. 198 vanilla_percentage: Fraction of data to run in pure PELT mode. 199 Currently fixed at 1.0 (full PELT) for the Python binding. 200 warm_start: If True, use previous segment parameters as initial 201 values. 202 203 Returns: 204 When cp_only=True: a list of change-point indices (1-based). 205 When cp_only=False: a CpdResult namedtuple. 206 """ 207 if data is None: 208 raise ValueError("data must be provided") 209 data = numpy.asarray(data, dtype=float) 210 if data.ndim == 1: 211 data = data.reshape(-1, 1) 212 213 family = family.lower() if family is not None else 'custom' 214 # Apply aliases (e.g. 'var' → 'mgaussian'). 215 family = _FAMILY_ALIASES.get(family, family) 216 217 if family not in _SUPPORTED_FAMILIES: 218 raise ValueError( 219 f"Family '{family}' is not supported by the Python binding. " 220 f"Supported families: {sorted(_SUPPORTED_FAMILIES)}." 221 ) 222 if cost_adjustment not in ('BIC', 'MBIC', 'MDL'): 223 raise ValueError( 224 f"cost_adjustment must be 'BIC', 'MBIC', or 'MDL', " 225 f"got {cost_adjustment!r}" 226 ) 227 228 # Variance estimation (NumPy; kept in Python because it uses array ops). 229 ve = _estimate_variance(data, family, p_response, variance_estimation) 230 231 # p=None or p=0 → C++ auto-infers from family + data shape. 232 p_int = int(p) if (p is not None and p > 0) else 0 233 234 # pruning_coef=None → C++ auto-computes (NaN is the sentinel). 235 pruning_float = ( 236 float('nan') if pruning_coef is None else float(pruning_coef) 237 ) 238 239 result = fastcpd_impl( 240 beta, # str or float – C++ handles both 241 cost_adjustment, 242 bool(cp_only), 243 data.tolist(), 244 float(epsilon), 245 family, 246 list(line_search), 247 list(lower) if lower is not None else [], 248 float(momentum_coef), 249 list(order) if hasattr(order, '__len__') else [float(order)], 250 p_int, 251 int(p_response), 252 pruning_float, # NaN → auto-compute in C++ 253 int(segment_count), 254 float(trim), 255 list(upper) if upper is not None else [], 256 float(vanilla_percentage), 257 ve.tolist(), 258 bool(warm_start), 259 ) 260 261 if cp_only: 262 return result['cp_set'] 263 264 return CpdResult( 265 cp_set=result['cp_set'], 266 raw_cp_set=result['raw_cp_set'], 267 cost_values=result['cost_values'], 268 residuals=result['residuals'], 269 thetas=result['thetas'], 270 ) 271 272 273def _estimate_variance(data, family, p_response, variance_estimation): 274 """Estimate the variance/covariance matrix for the given family.""" 275 if variance_estimation is not None: 276 return numpy.asarray(variance_estimation, dtype=float) 277 if family == 'mean': 278 return fastcpd.variance_estimation.mean(data) 279 if family == 'mgaussian': 280 # Estimate Σ using Rice estimator on OLS residuals. 281 # Using raw Y differences inflates Σ when predictors have high 282 # variance, so we first partial out the predictor effect. 283 d = data.shape[1] 284 q = p_response if p_response > 0 else d 285 y_cols = data[:, :q] 286 x_cols = data[:, q:] 287 if x_cols.shape[1] > 0: 288 b_hat, _, _, _ = numpy.linalg.lstsq(x_cols, y_cols, rcond=None) 289 resid = y_cols - x_cols @ b_hat 290 else: 291 resid = y_cols - y_cols.mean(axis=0) 292 diffs = resid[1:] - resid[:-1] 293 return numpy.mean(diffs[:, :, None] * diffs[:, None, :], axis=0) / 2 294 # All other families: variance_estimate is not used by the C++ cost 295 # function, so a 1×1 identity placeholder is sufficient. 296 return numpy.eye(1)
CpdResult(cp_set, raw_cp_set, cost_values, residuals, thetas)
43def mean(data, **kwargs): 44 """Find change points efficiently in mean change models. 45 46 Args: 47 data: Univariate or multivariate data for mean change detection. 48 **kwargs: Additional arguments passed to ``detect()``. 49 50 Returns: 51 A list of change-point indices, or a CpdResult when cp_only=False. 52 """ 53 return detect(data=data, family='mean', **kwargs)
Find change points efficiently in mean change models.
Arguments:
- data: Univariate or multivariate data for mean change detection.
- **kwargs: Additional arguments passed to
detect().
Returns:
A list of change-point indices, or a CpdResult when cp_only=False.
56def variance(data, **kwargs): 57 """Find change points efficiently in variance change models. 58 59 Args: 60 data: Univariate or multivariate data for variance change detection. 61 **kwargs: Additional arguments passed to ``detect()``. 62 63 Returns: 64 A list of change-point indices, or a CpdResult when cp_only=False. 65 """ 66 return detect(data=data, family='variance', **kwargs)
Find change points efficiently in variance change models.
Arguments:
- data: Univariate or multivariate data for variance change detection.
- **kwargs: Additional arguments passed to
detect().
Returns:
A list of change-point indices, or a CpdResult when cp_only=False.
69def meanvariance(data, **kwargs): 70 """Find change points efficiently in mean and/or variance change models. 71 72 Args: 73 data: Univariate or multivariate data for mean and/or variance change 74 detection. 75 **kwargs: Additional arguments passed to ``detect()``. 76 77 Returns: 78 A list of change-point indices, or a CpdResult when cp_only=False. 79 """ 80 return detect(data=data, family='meanvariance', **kwargs)
Find change points efficiently in mean and/or variance change models.
Arguments:
- data: Univariate or multivariate data for mean and/or variance change detection.
- **kwargs: Additional arguments passed to
detect().
Returns:
A list of change-point indices, or a CpdResult when cp_only=False.
83def var(data, order, **kwargs): 84 """Find change points efficiently in VAR (vector autoregression) models. 85 86 Args: 87 data: Multivariate time series data, shape (n, q+p) where q columns 88 are responses and p columns are lagged predictors. 89 order: Number of lagged predictors per response (p). 90 **kwargs: Additional arguments passed to ``detect()``. 91 92 Returns: 93 A list of change-point indices, or a CpdResult when cp_only=False. 94 """ 95 return detect(data=data, family='mgaussian', **kwargs)
Find change points efficiently in VAR (vector autoregression) models.
Arguments:
- data: Multivariate time series data, shape (n, q+p) where q columns are responses and p columns are lagged predictors.
- order: Number of lagged predictors per response (p).
- **kwargs: Additional arguments passed to
detect().
Returns:
A list of change-point indices, or a CpdResult when cp_only=False.
98def arma(data, order=(0, 0), **kwargs): 99 """Find change points efficiently in ARMA models. 100 101 Args: 102 data: Univariate time series data. 103 order: Tuple (p, q) for AR and MA orders. 104 **kwargs: Additional arguments passed to ``detect()``. 105 106 Returns: 107 A list of change-point indices, or a CpdResult when cp_only=False. 108 """ 109 return detect(data=data, family='arma', order=order, **kwargs)
Find change points efficiently in ARMA models.
Arguments:
- data: Univariate time series data.
- order: Tuple (p, q) for AR and MA orders.
- **kwargs: Additional arguments passed to
detect().
Returns:
A list of change-point indices, or a CpdResult when cp_only=False.
112def lasso(data, **kwargs): 113 """Find change points efficiently in LASSO regression models. 114 115 Args: 116 data: Data where the first column is the response. 117 **kwargs: Additional arguments passed to ``detect()``. 118 119 Returns: 120 A list of change-point indices, or a CpdResult when cp_only=False. 121 """ 122 return detect(data=data, family='lasso', **kwargs)
Find change points efficiently in LASSO regression models.
Arguments:
- data: Data where the first column is the response.
- **kwargs: Additional arguments passed to
detect().
Returns:
A list of change-point indices, or a CpdResult when cp_only=False.
125def ma(data, order=0, **kwargs): 126 """Find change points efficiently in MA (moving average) models. 127 128 Args: 129 data: Univariate time series data. 130 order: MA order q. 131 **kwargs: Additional arguments passed to ``detect()``. 132 133 Returns: 134 A list of change-point indices, or a CpdResult when cp_only=False. 135 """ 136 return detect(data=data, family='ma', order=(0, order), **kwargs)
Find change points efficiently in MA (moving average) models.
Arguments:
- data: Univariate time series data.
- order: MA order q.
- **kwargs: Additional arguments passed to
detect().
Returns:
A list of change-point indices, or a CpdResult when cp_only=False.
139def detect( 140 formula: str = 'y ~ . - 1', 141 data: numpy.ndarray = None, 142 beta='MBIC', 143 cost_adjustment: str = 'MBIC', 144 family: str = None, 145 cost=None, 146 cost_gradient=None, 147 cost_hessian=None, 148 line_search=(1,), 149 lower=None, 150 upper=None, 151 pruning_coef=None, 152 segment_count: int = 10, 153 trim: float = 0.05, 154 momentum_coef: float = 0.0, 155 multiple_epochs=lambda x: 0, 156 epsilon: float = 1e-10, 157 order=(0, 0, 0), 158 p: int = None, 159 p_response: int = 0, 160 variance_estimation=None, 161 cp_only: bool = False, 162 vanilla_percentage: float = 1.0, 163 warm_start: bool = False, 164 **kwargs 165): 166 r"""Find change points efficiently. 167 168 Args: 169 formula: A formula string (unused; present for API parity with R). 170 data: A NumPy array of shape (n, d) containing the data. 171 beta: Penalty criterion. One of 'BIC', 'MBIC', 'MDL', or a float. 172 The numeric value of the penalty is computed by C++ when a string 173 is supplied, using the same formulae as the R package. 174 cost_adjustment: One of 'BIC', 'MBIC', 'MDL'. 175 family: One of 'mean', 'variance', 'meanvariance', 'mgaussian', 176 'var' (alias for 'mgaussian'), 'lasso'. 177 line_search: Values for line search step sizes. 178 lower: Lower bound for parameters after each update. 179 upper: Upper bound for parameters after each update. 180 pruning_coef: Base pruning coefficient for the PELT algorithm. 181 ``None`` (default) lets C++ compute the appropriate value 182 automatically based on ``cost_adjustment`` and ``family``, 183 matching R's ``get_pruning_coef()`` behaviour. 184 segment_count: Initial guess for number of segments. 185 trim: Trimming proportion for boundary change points. 186 momentum_coef: Momentum coefficient for parameter updates. 187 epsilon: Epsilon for numerical stability. 188 order: Order for ARMA/MA models as tuple (ar_order, ma_order). 189 p: Number of model parameters. ``None`` (or 0) triggers automatic 190 inference from ``family`` and the data dimensions in C++, 191 matching the R package's per-family formulas. 192 p_response: Number of response columns (mgaussian only). 193 variance_estimation: Pre-specified variance/covariance matrix. 194 When not supplied, estimated automatically (Rice estimator for 195 mean/mgaussian, identity otherwise). 196 cp_only: If True, return only change-point indices (list of floats). 197 If False, return a CpdResult namedtuple with cp_set, raw_cp_set, 198 cost_values, residuals, and thetas. 199 vanilla_percentage: Fraction of data to run in pure PELT mode. 200 Currently fixed at 1.0 (full PELT) for the Python binding. 201 warm_start: If True, use previous segment parameters as initial 202 values. 203 204 Returns: 205 When cp_only=True: a list of change-point indices (1-based). 206 When cp_only=False: a CpdResult namedtuple. 207 """ 208 if data is None: 209 raise ValueError("data must be provided") 210 data = numpy.asarray(data, dtype=float) 211 if data.ndim == 1: 212 data = data.reshape(-1, 1) 213 214 family = family.lower() if family is not None else 'custom' 215 # Apply aliases (e.g. 'var' → 'mgaussian'). 216 family = _FAMILY_ALIASES.get(family, family) 217 218 if family not in _SUPPORTED_FAMILIES: 219 raise ValueError( 220 f"Family '{family}' is not supported by the Python binding. " 221 f"Supported families: {sorted(_SUPPORTED_FAMILIES)}." 222 ) 223 if cost_adjustment not in ('BIC', 'MBIC', 'MDL'): 224 raise ValueError( 225 f"cost_adjustment must be 'BIC', 'MBIC', or 'MDL', " 226 f"got {cost_adjustment!r}" 227 ) 228 229 # Variance estimation (NumPy; kept in Python because it uses array ops). 230 ve = _estimate_variance(data, family, p_response, variance_estimation) 231 232 # p=None or p=0 → C++ auto-infers from family + data shape. 233 p_int = int(p) if (p is not None and p > 0) else 0 234 235 # pruning_coef=None → C++ auto-computes (NaN is the sentinel). 236 pruning_float = ( 237 float('nan') if pruning_coef is None else float(pruning_coef) 238 ) 239 240 result = fastcpd_impl( 241 beta, # str or float – C++ handles both 242 cost_adjustment, 243 bool(cp_only), 244 data.tolist(), 245 float(epsilon), 246 family, 247 list(line_search), 248 list(lower) if lower is not None else [], 249 float(momentum_coef), 250 list(order) if hasattr(order, '__len__') else [float(order)], 251 p_int, 252 int(p_response), 253 pruning_float, # NaN → auto-compute in C++ 254 int(segment_count), 255 float(trim), 256 list(upper) if upper is not None else [], 257 float(vanilla_percentage), 258 ve.tolist(), 259 bool(warm_start), 260 ) 261 262 if cp_only: 263 return result['cp_set'] 264 265 return CpdResult( 266 cp_set=result['cp_set'], 267 raw_cp_set=result['raw_cp_set'], 268 cost_values=result['cost_values'], 269 residuals=result['residuals'], 270 thetas=result['thetas'], 271 )
Find change points efficiently.
Arguments:
- formula: A formula string (unused; present for API parity with R).
- data: A NumPy array of shape (n, d) containing the data.
- beta: Penalty criterion. One of 'BIC', 'MBIC', 'MDL', or a float. The numeric value of the penalty is computed by C++ when a string is supplied, using the same formulae as the R package.
- cost_adjustment: One of 'BIC', 'MBIC', 'MDL'.
- family: One of 'mean', 'variance', 'meanvariance', 'mgaussian', 'var' (alias for 'mgaussian'), 'lasso'.
- line_search: Values for line search step sizes.
- lower: Lower bound for parameters after each update.
- upper: Upper bound for parameters after each update.
- pruning_coef: Base pruning coefficient for the PELT algorithm.
None(default) lets C++ compute the appropriate value automatically based oncost_adjustmentandfamily, matching R'sget_pruning_coef()behaviour. - segment_count: Initial guess for number of segments.
- trim: Trimming proportion for boundary change points.
- momentum_coef: Momentum coefficient for parameter updates.
- epsilon: Epsilon for numerical stability.
- order: Order for ARMA/MA models as tuple (ar_order, ma_order).
- p: Number of model parameters.
None(or 0) triggers automatic inference fromfamilyand the data dimensions in C++, matching the R package's per-family formulas. - p_response: Number of response columns (mgaussian only).
- variance_estimation: Pre-specified variance/covariance matrix. When not supplied, estimated automatically (Rice estimator for mean/mgaussian, identity otherwise).
- cp_only: If True, return only change-point indices (list of floats). If False, return a CpdResult namedtuple with cp_set, raw_cp_set, cost_values, residuals, and thetas.
- vanilla_percentage: Fraction of data to run in pure PELT mode. Currently fixed at 1.0 (full PELT) for the Python binding.
- warm_start: If True, use previous segment parameters as initial values.
Returns:
When cp_only=True: a list of change-point indices (1-based). When cp_only=False: a CpdResult namedtuple.