fastcpd.segmentation

Perform change point detection using fastcpd.

  1"""
  2Perform change point detection using fastcpd.
  3"""
  4
  5import collections
  6import numpy
  7import fastcpd.variance_estimation
  8from fastcpd.interface import fastcpd_impl
  9
 10# Families supported by the C++ Python binding (NO_RCPP mode).
 11#
 12# PELT families (fully supported): mean, variance, meanvariance, mgaussian.
 13# SEN families (OLS-initialised, no glmnet/arima): lasso.
 14#
 15# Not yet supported (require R packages at runtime):
 16#   arma, arima, ma  — need stats::arima()
 17#   garch            — needs tseries::garch() + Fortran
 18#   gaussian/lm, binomial, poisson — need glmnet / fastglm (Rcpp + Eigen)
 19_SUPPORTED_FAMILIES = frozenset({
 20    'mean', 'variance', 'meanvariance', 'mgaussian', 'lasso',
 21})
 22
 23# Map R-style alias names to the internal C++ family string.
 24_FAMILY_ALIASES = {
 25    'var': 'mgaussian',
 26}
 27
 28# Result object returned by detect() when cp_only=False.
 29# Fields:
 30#   cp_set      – list of change-point indices (1-based, matching R package)
 31#   raw_cp_set  – raw change-point indices before boundary trimming
 32#   cost_values – list of segment cost values (one per segment)
 33#   residuals   – nested list of shape (n_obs, n_response)
 34#   thetas      – nested list of shape (n_params, n_segments); column j holds
 35#                 the estimated parameters for segment j
 36CpdResult = collections.namedtuple(
 37    'CpdResult',
 38    ['cp_set', 'raw_cp_set', 'cost_values', 'residuals', 'thetas'],
 39)
 40
 41
 42def mean(data, **kwargs):
 43    """Find change points efficiently in mean change models.
 44
 45    Args:
 46        data: Univariate or multivariate data for mean change detection.
 47        **kwargs: Additional arguments passed to ``detect()``.
 48
 49    Returns:
 50        A list of change-point indices, or a CpdResult when cp_only=False.
 51    """
 52    return detect(data=data, family='mean', **kwargs)
 53
 54
 55def variance(data, **kwargs):
 56    """Find change points efficiently in variance change models.
 57
 58    Args:
 59        data: Univariate or multivariate data for variance change detection.
 60        **kwargs: Additional arguments passed to ``detect()``.
 61
 62    Returns:
 63        A list of change-point indices, or a CpdResult when cp_only=False.
 64    """
 65    return detect(data=data, family='variance', **kwargs)
 66
 67
 68def meanvariance(data, **kwargs):
 69    """Find change points efficiently in mean and/or variance change models.
 70
 71    Args:
 72        data: Univariate or multivariate data for mean and/or variance change
 73            detection.
 74        **kwargs: Additional arguments passed to ``detect()``.
 75
 76    Returns:
 77        A list of change-point indices, or a CpdResult when cp_only=False.
 78    """
 79    return detect(data=data, family='meanvariance', **kwargs)
 80
 81
 82def var(data, order, **kwargs):
 83    """Find change points efficiently in VAR (vector autoregression) models.
 84
 85    Args:
 86        data: Multivariate time series data, shape (n, q+p) where q columns
 87            are responses and p columns are lagged predictors.
 88        order: Number of lagged predictors per response (p).
 89        **kwargs: Additional arguments passed to ``detect()``.
 90
 91    Returns:
 92        A list of change-point indices, or a CpdResult when cp_only=False.
 93    """
 94    return detect(data=data, family='mgaussian', **kwargs)
 95
 96
 97def arma(data, order=(0, 0), **kwargs):
 98    """Find change points efficiently in ARMA models.
 99
100    Args:
101        data: Univariate time series data.
102        order: Tuple (p, q) for AR and MA orders.
103        **kwargs: Additional arguments passed to ``detect()``.
104
105    Returns:
106        A list of change-point indices, or a CpdResult when cp_only=False.
107    """
108    return detect(data=data, family='arma', order=order, **kwargs)
109
110
111def lasso(data, **kwargs):
112    """Find change points efficiently in LASSO regression models.
113
114    Args:
115        data: Data where the first column is the response.
116        **kwargs: Additional arguments passed to ``detect()``.
117
118    Returns:
119        A list of change-point indices, or a CpdResult when cp_only=False.
120    """
121    return detect(data=data, family='lasso', **kwargs)
122
123
124def ma(data, order=0, **kwargs):
125    """Find change points efficiently in MA (moving average) models.
126
127    Args:
128        data: Univariate time series data.
129        order: MA order q.
130        **kwargs: Additional arguments passed to ``detect()``.
131
132    Returns:
133        A list of change-point indices, or a CpdResult when cp_only=False.
134    """
135    return detect(data=data, family='ma', order=(0, order), **kwargs)
136
137
138def detect(
139    formula: str = 'y ~ . - 1',
140    data: numpy.ndarray = None,
141    beta='MBIC',
142    cost_adjustment: str = 'MBIC',
143    family: str = None,
144    cost=None,
145    cost_gradient=None,
146    cost_hessian=None,
147    line_search=(1,),
148    lower=None,
149    upper=None,
150    pruning_coef=None,
151    segment_count: int = 10,
152    trim: float = 0.05,
153    momentum_coef: float = 0.0,
154    multiple_epochs=lambda x: 0,
155    epsilon: float = 1e-10,
156    order=(0, 0, 0),
157    p: int = None,
158    p_response: int = 0,
159    variance_estimation=None,
160    cp_only: bool = False,
161    vanilla_percentage: float = 1.0,
162    warm_start: bool = False,
163    **kwargs
164):
165    r"""Find change points efficiently.
166
167    Args:
168        formula: A formula string (unused; present for API parity with R).
169        data: A NumPy array of shape (n, d) containing the data.
170        beta: Penalty criterion. One of 'BIC', 'MBIC', 'MDL', or a float.
171            The numeric value of the penalty is computed by C++ when a string
172            is supplied, using the same formulae as the R package.
173        cost_adjustment: One of 'BIC', 'MBIC', 'MDL'.
174        family: One of 'mean', 'variance', 'meanvariance', 'mgaussian',
175            'var' (alias for 'mgaussian'), 'lasso'.
176        line_search: Values for line search step sizes.
177        lower: Lower bound for parameters after each update.
178        upper: Upper bound for parameters after each update.
179        pruning_coef: Base pruning coefficient for the PELT algorithm.
180            ``None`` (default) lets C++ compute the appropriate value
181            automatically based on ``cost_adjustment`` and ``family``,
182            matching R's ``get_pruning_coef()`` behaviour.
183        segment_count: Initial guess for number of segments.
184        trim: Trimming proportion for boundary change points.
185        momentum_coef: Momentum coefficient for parameter updates.
186        epsilon: Epsilon for numerical stability.
187        order: Order for ARMA/MA models as tuple (ar_order, ma_order).
188        p: Number of model parameters.  ``None`` (or 0) triggers automatic
189            inference from ``family`` and the data dimensions in C++,
190            matching the R package's per-family formulas.
191        p_response: Number of response columns (mgaussian only).
192        variance_estimation: Pre-specified variance/covariance matrix.
193            When not supplied, estimated automatically (Rice estimator for
194            mean/mgaussian, identity otherwise).
195        cp_only: If True, return only change-point indices (list of floats).
196            If False, return a CpdResult namedtuple with cp_set, raw_cp_set,
197            cost_values, residuals, and thetas.
198        vanilla_percentage: Fraction of data to run in pure PELT mode.
199            Currently fixed at 1.0 (full PELT) for the Python binding.
200        warm_start: If True, use previous segment parameters as initial
201            values.
202
203    Returns:
204        When cp_only=True: a list of change-point indices (1-based).
205        When cp_only=False: a CpdResult namedtuple.
206    """
207    if data is None:
208        raise ValueError("data must be provided")
209    data = numpy.asarray(data, dtype=float)
210    if data.ndim == 1:
211        data = data.reshape(-1, 1)
212
213    family = family.lower() if family is not None else 'custom'
214    # Apply aliases (e.g. 'var' → 'mgaussian').
215    family = _FAMILY_ALIASES.get(family, family)
216
217    if family not in _SUPPORTED_FAMILIES:
218        raise ValueError(
219            f"Family '{family}' is not supported by the Python binding. "
220            f"Supported families: {sorted(_SUPPORTED_FAMILIES)}."
221        )
222    if cost_adjustment not in ('BIC', 'MBIC', 'MDL'):
223        raise ValueError(
224            f"cost_adjustment must be 'BIC', 'MBIC', or 'MDL', "
225            f"got {cost_adjustment!r}"
226        )
227
228    # Variance estimation (NumPy; kept in Python because it uses array ops).
229    ve = _estimate_variance(data, family, p_response, variance_estimation)
230
231    # p=None or p=0 → C++ auto-infers from family + data shape.
232    p_int = int(p) if (p is not None and p > 0) else 0
233
234    # pruning_coef=None → C++ auto-computes (NaN is the sentinel).
235    pruning_float = (
236        float('nan') if pruning_coef is None else float(pruning_coef)
237    )
238
239    result = fastcpd_impl(
240        beta,                   # str or float – C++ handles both
241        cost_adjustment,
242        bool(cp_only),
243        data.tolist(),
244        float(epsilon),
245        family,
246        list(line_search),
247        list(lower) if lower is not None else [],
248        float(momentum_coef),
249        list(order) if hasattr(order, '__len__') else [float(order)],
250        p_int,
251        int(p_response),
252        pruning_float,          # NaN → auto-compute in C++
253        int(segment_count),
254        float(trim),
255        list(upper) if upper is not None else [],
256        float(vanilla_percentage),
257        ve.tolist(),
258        bool(warm_start),
259    )
260
261    if cp_only:
262        return result['cp_set']
263
264    return CpdResult(
265        cp_set=result['cp_set'],
266        raw_cp_set=result['raw_cp_set'],
267        cost_values=result['cost_values'],
268        residuals=result['residuals'],
269        thetas=result['thetas'],
270    )
271
272
273def _estimate_variance(data, family, p_response, variance_estimation):
274    """Estimate the variance/covariance matrix for the given family."""
275    if variance_estimation is not None:
276        return numpy.asarray(variance_estimation, dtype=float)
277    if family == 'mean':
278        return fastcpd.variance_estimation.mean(data)
279    if family == 'mgaussian':
280        # Estimate Σ using Rice estimator on OLS residuals.
281        # Using raw Y differences inflates Σ when predictors have high
282        # variance, so we first partial out the predictor effect.
283        d = data.shape[1]
284        q = p_response if p_response > 0 else d
285        y_cols = data[:, :q]
286        x_cols = data[:, q:]
287        if x_cols.shape[1] > 0:
288            b_hat, _, _, _ = numpy.linalg.lstsq(x_cols, y_cols, rcond=None)
289            resid = y_cols - x_cols @ b_hat
290        else:
291            resid = y_cols - y_cols.mean(axis=0)
292        diffs = resid[1:] - resid[:-1]
293        return numpy.mean(diffs[:, :, None] * diffs[:, None, :], axis=0) / 2
294    # All other families: variance_estimate is not used by the C++ cost
295    # function, so a 1×1 identity placeholder is sufficient.
296    return numpy.eye(1)
class CpdResult(builtins.tuple):

CpdResult(cp_set, raw_cp_set, cost_values, residuals, thetas)

CpdResult(cp_set, raw_cp_set, cost_values, residuals, thetas)

Create new instance of CpdResult(cp_set, raw_cp_set, cost_values, residuals, thetas)

cp_set

Alias for field number 0

raw_cp_set

Alias for field number 1

cost_values

Alias for field number 2

residuals

Alias for field number 3

thetas

Alias for field number 4

def mean(data, **kwargs):
43def mean(data, **kwargs):
44    """Find change points efficiently in mean change models.
45
46    Args:
47        data: Univariate or multivariate data for mean change detection.
48        **kwargs: Additional arguments passed to ``detect()``.
49
50    Returns:
51        A list of change-point indices, or a CpdResult when cp_only=False.
52    """
53    return detect(data=data, family='mean', **kwargs)

Find change points efficiently in mean change models.

Arguments:
  • data: Univariate or multivariate data for mean change detection.
  • **kwargs: Additional arguments passed to detect().
Returns:

A list of change-point indices, or a CpdResult when cp_only=False.

def variance(data, **kwargs):
56def variance(data, **kwargs):
57    """Find change points efficiently in variance change models.
58
59    Args:
60        data: Univariate or multivariate data for variance change detection.
61        **kwargs: Additional arguments passed to ``detect()``.
62
63    Returns:
64        A list of change-point indices, or a CpdResult when cp_only=False.
65    """
66    return detect(data=data, family='variance', **kwargs)

Find change points efficiently in variance change models.

Arguments:
  • data: Univariate or multivariate data for variance change detection.
  • **kwargs: Additional arguments passed to detect().
Returns:

A list of change-point indices, or a CpdResult when cp_only=False.

def meanvariance(data, **kwargs):
69def meanvariance(data, **kwargs):
70    """Find change points efficiently in mean and/or variance change models.
71
72    Args:
73        data: Univariate or multivariate data for mean and/or variance change
74            detection.
75        **kwargs: Additional arguments passed to ``detect()``.
76
77    Returns:
78        A list of change-point indices, or a CpdResult when cp_only=False.
79    """
80    return detect(data=data, family='meanvariance', **kwargs)

Find change points efficiently in mean and/or variance change models.

Arguments:
  • data: Univariate or multivariate data for mean and/or variance change detection.
  • **kwargs: Additional arguments passed to detect().
Returns:

A list of change-point indices, or a CpdResult when cp_only=False.

def var(data, order, **kwargs):
83def var(data, order, **kwargs):
84    """Find change points efficiently in VAR (vector autoregression) models.
85
86    Args:
87        data: Multivariate time series data, shape (n, q+p) where q columns
88            are responses and p columns are lagged predictors.
89        order: Number of lagged predictors per response (p).
90        **kwargs: Additional arguments passed to ``detect()``.
91
92    Returns:
93        A list of change-point indices, or a CpdResult when cp_only=False.
94    """
95    return detect(data=data, family='mgaussian', **kwargs)

Find change points efficiently in VAR (vector autoregression) models.

Arguments:
  • data: Multivariate time series data, shape (n, q+p) where q columns are responses and p columns are lagged predictors.
  • order: Number of lagged predictors per response (p).
  • **kwargs: Additional arguments passed to detect().
Returns:

A list of change-point indices, or a CpdResult when cp_only=False.

def arma(data, order=(0, 0), **kwargs):
 98def arma(data, order=(0, 0), **kwargs):
 99    """Find change points efficiently in ARMA models.
100
101    Args:
102        data: Univariate time series data.
103        order: Tuple (p, q) for AR and MA orders.
104        **kwargs: Additional arguments passed to ``detect()``.
105
106    Returns:
107        A list of change-point indices, or a CpdResult when cp_only=False.
108    """
109    return detect(data=data, family='arma', order=order, **kwargs)

Find change points efficiently in ARMA models.

Arguments:
  • data: Univariate time series data.
  • order: Tuple (p, q) for AR and MA orders.
  • **kwargs: Additional arguments passed to detect().
Returns:

A list of change-point indices, or a CpdResult when cp_only=False.

def lasso(data, **kwargs):
112def lasso(data, **kwargs):
113    """Find change points efficiently in LASSO regression models.
114
115    Args:
116        data: Data where the first column is the response.
117        **kwargs: Additional arguments passed to ``detect()``.
118
119    Returns:
120        A list of change-point indices, or a CpdResult when cp_only=False.
121    """
122    return detect(data=data, family='lasso', **kwargs)

Find change points efficiently in LASSO regression models.

Arguments:
  • data: Data where the first column is the response.
  • **kwargs: Additional arguments passed to detect().
Returns:

A list of change-point indices, or a CpdResult when cp_only=False.

def ma(data, order=0, **kwargs):
125def ma(data, order=0, **kwargs):
126    """Find change points efficiently in MA (moving average) models.
127
128    Args:
129        data: Univariate time series data.
130        order: MA order q.
131        **kwargs: Additional arguments passed to ``detect()``.
132
133    Returns:
134        A list of change-point indices, or a CpdResult when cp_only=False.
135    """
136    return detect(data=data, family='ma', order=(0, order), **kwargs)

Find change points efficiently in MA (moving average) models.

Arguments:
  • data: Univariate time series data.
  • order: MA order q.
  • **kwargs: Additional arguments passed to detect().
Returns:

A list of change-point indices, or a CpdResult when cp_only=False.

def detect( formula: str = 'y ~ . - 1', data: numpy.ndarray = None, beta='MBIC', cost_adjustment: str = 'MBIC', family: str = None, cost=None, cost_gradient=None, cost_hessian=None, line_search=(1,), lower=None, upper=None, pruning_coef=None, segment_count: int = 10, trim: float = 0.05, momentum_coef: float = 0.0, multiple_epochs=<function <lambda>>, epsilon: float = 1e-10, order=(0, 0, 0), p: int = None, p_response: int = 0, variance_estimation=None, cp_only: bool = False, vanilla_percentage: float = 1.0, warm_start: bool = False, **kwargs):
139def detect(
140    formula: str = 'y ~ . - 1',
141    data: numpy.ndarray = None,
142    beta='MBIC',
143    cost_adjustment: str = 'MBIC',
144    family: str = None,
145    cost=None,
146    cost_gradient=None,
147    cost_hessian=None,
148    line_search=(1,),
149    lower=None,
150    upper=None,
151    pruning_coef=None,
152    segment_count: int = 10,
153    trim: float = 0.05,
154    momentum_coef: float = 0.0,
155    multiple_epochs=lambda x: 0,
156    epsilon: float = 1e-10,
157    order=(0, 0, 0),
158    p: int = None,
159    p_response: int = 0,
160    variance_estimation=None,
161    cp_only: bool = False,
162    vanilla_percentage: float = 1.0,
163    warm_start: bool = False,
164    **kwargs
165):
166    r"""Find change points efficiently.
167
168    Args:
169        formula: A formula string (unused; present for API parity with R).
170        data: A NumPy array of shape (n, d) containing the data.
171        beta: Penalty criterion. One of 'BIC', 'MBIC', 'MDL', or a float.
172            The numeric value of the penalty is computed by C++ when a string
173            is supplied, using the same formulae as the R package.
174        cost_adjustment: One of 'BIC', 'MBIC', 'MDL'.
175        family: One of 'mean', 'variance', 'meanvariance', 'mgaussian',
176            'var' (alias for 'mgaussian'), 'lasso'.
177        line_search: Values for line search step sizes.
178        lower: Lower bound for parameters after each update.
179        upper: Upper bound for parameters after each update.
180        pruning_coef: Base pruning coefficient for the PELT algorithm.
181            ``None`` (default) lets C++ compute the appropriate value
182            automatically based on ``cost_adjustment`` and ``family``,
183            matching R's ``get_pruning_coef()`` behaviour.
184        segment_count: Initial guess for number of segments.
185        trim: Trimming proportion for boundary change points.
186        momentum_coef: Momentum coefficient for parameter updates.
187        epsilon: Epsilon for numerical stability.
188        order: Order for ARMA/MA models as tuple (ar_order, ma_order).
189        p: Number of model parameters.  ``None`` (or 0) triggers automatic
190            inference from ``family`` and the data dimensions in C++,
191            matching the R package's per-family formulas.
192        p_response: Number of response columns (mgaussian only).
193        variance_estimation: Pre-specified variance/covariance matrix.
194            When not supplied, estimated automatically (Rice estimator for
195            mean/mgaussian, identity otherwise).
196        cp_only: If True, return only change-point indices (list of floats).
197            If False, return a CpdResult namedtuple with cp_set, raw_cp_set,
198            cost_values, residuals, and thetas.
199        vanilla_percentage: Fraction of data to run in pure PELT mode.
200            Currently fixed at 1.0 (full PELT) for the Python binding.
201        warm_start: If True, use previous segment parameters as initial
202            values.
203
204    Returns:
205        When cp_only=True: a list of change-point indices (1-based).
206        When cp_only=False: a CpdResult namedtuple.
207    """
208    if data is None:
209        raise ValueError("data must be provided")
210    data = numpy.asarray(data, dtype=float)
211    if data.ndim == 1:
212        data = data.reshape(-1, 1)
213
214    family = family.lower() if family is not None else 'custom'
215    # Apply aliases (e.g. 'var' → 'mgaussian').
216    family = _FAMILY_ALIASES.get(family, family)
217
218    if family not in _SUPPORTED_FAMILIES:
219        raise ValueError(
220            f"Family '{family}' is not supported by the Python binding. "
221            f"Supported families: {sorted(_SUPPORTED_FAMILIES)}."
222        )
223    if cost_adjustment not in ('BIC', 'MBIC', 'MDL'):
224        raise ValueError(
225            f"cost_adjustment must be 'BIC', 'MBIC', or 'MDL', "
226            f"got {cost_adjustment!r}"
227        )
228
229    # Variance estimation (NumPy; kept in Python because it uses array ops).
230    ve = _estimate_variance(data, family, p_response, variance_estimation)
231
232    # p=None or p=0 → C++ auto-infers from family + data shape.
233    p_int = int(p) if (p is not None and p > 0) else 0
234
235    # pruning_coef=None → C++ auto-computes (NaN is the sentinel).
236    pruning_float = (
237        float('nan') if pruning_coef is None else float(pruning_coef)
238    )
239
240    result = fastcpd_impl(
241        beta,                   # str or float – C++ handles both
242        cost_adjustment,
243        bool(cp_only),
244        data.tolist(),
245        float(epsilon),
246        family,
247        list(line_search),
248        list(lower) if lower is not None else [],
249        float(momentum_coef),
250        list(order) if hasattr(order, '__len__') else [float(order)],
251        p_int,
252        int(p_response),
253        pruning_float,          # NaN → auto-compute in C++
254        int(segment_count),
255        float(trim),
256        list(upper) if upper is not None else [],
257        float(vanilla_percentage),
258        ve.tolist(),
259        bool(warm_start),
260    )
261
262    if cp_only:
263        return result['cp_set']
264
265    return CpdResult(
266        cp_set=result['cp_set'],
267        raw_cp_set=result['raw_cp_set'],
268        cost_values=result['cost_values'],
269        residuals=result['residuals'],
270        thetas=result['thetas'],
271    )

Find change points efficiently.

Arguments:
  • formula: A formula string (unused; present for API parity with R).
  • data: A NumPy array of shape (n, d) containing the data.
  • beta: Penalty criterion. One of 'BIC', 'MBIC', 'MDL', or a float. The numeric value of the penalty is computed by C++ when a string is supplied, using the same formulae as the R package.
  • cost_adjustment: One of 'BIC', 'MBIC', 'MDL'.
  • family: One of 'mean', 'variance', 'meanvariance', 'mgaussian', 'var' (alias for 'mgaussian'), 'lasso'.
  • line_search: Values for line search step sizes.
  • lower: Lower bound for parameters after each update.
  • upper: Upper bound for parameters after each update.
  • pruning_coef: Base pruning coefficient for the PELT algorithm. None (default) lets C++ compute the appropriate value automatically based on cost_adjustment and family, matching R's get_pruning_coef() behaviour.
  • segment_count: Initial guess for number of segments.
  • trim: Trimming proportion for boundary change points.
  • momentum_coef: Momentum coefficient for parameter updates.
  • epsilon: Epsilon for numerical stability.
  • order: Order for ARMA/MA models as tuple (ar_order, ma_order).
  • p: Number of model parameters. None (or 0) triggers automatic inference from family and the data dimensions in C++, matching the R package's per-family formulas.
  • p_response: Number of response columns (mgaussian only).
  • variance_estimation: Pre-specified variance/covariance matrix. When not supplied, estimated automatically (Rice estimator for mean/mgaussian, identity otherwise).
  • cp_only: If True, return only change-point indices (list of floats). If False, return a CpdResult namedtuple with cp_set, raw_cp_set, cost_values, residuals, and thetas.
  • vanilla_percentage: Fraction of data to run in pure PELT mode. Currently fixed at 1.0 (full PELT) for the Python binding.
  • warm_start: If True, use previous segment parameters as initial values.
Returns:

When cp_only=True: a list of change-point indices (1-based). When cp_only=False: a CpdResult namedtuple.