fastcpd.segmentation

Perform change point detection using fastcpd.

View Source

  1"""
  2Perform change point detection using fastcpd.
  3"""
  4
  5# import pandas as pd
  6import numpy
  7# from patsy import dmatrices
  8
  9import fastcpd.variance_estimation
 10from math import log, log2
 11from fastcpd.interface import fastcpd_impl
 12
 13
 14def mean(data, **kwargs):
 15    """Find change points efficiently in mean change models.
 16
 17    Args:
 18        data: Univariate or multivariate data for mean change detection.
 19        **kwargs: Additional arguments passed to ``detect()``.
 20
 21    Returns:
 22        A ``FastCPDResult`` object.
 23    """
 24    return detect(data=data, family='mean', **kwargs)
 25
 26
 27def variance(data, **kwargs):
 28    """Find change points efficiently in variance change models.
 29
 30    Args:
 31        data: Univariate or multivariate data for variance change detection.
 32        **kwargs: Additional arguments passed to ``detect()``.
 33
 34    Returns:
 35        A ``FastCPDResult`` object.
 36    """
 37    return detect(data=data, family='variance', **kwargs)
 38
 39
 40def meanvariance(data, **kwargs):
 41    """Find change points efficiently in mean and/or variance change models.
 42
 43    Args:
 44        data: Univariate or multivariate data for mean and/or variance change
 45            detection.
 46        **kwargs: Additional arguments passed to ``detect()``.
 47
 48    Returns:
 49        A ``FastCPDResult`` object.
 50    """
 51    return detect(data=data, family='meanvariance', **kwargs)
 52
 53
 54def detect(
 55    formula: str = 'y ~ . - 1',
 56    data: numpy.ndarray = None,
 57    beta: object = 'MBIC',
 58    cost_adjustment: str = 'MBIC',
 59    family: str = None,
 60    cost=None,
 61    cost_gradient=None,
 62    cost_hessian=None,
 63    line_search=(1,),
 64    lower=None,
 65    upper=None,
 66    pruning_coef: float = None,
 67    segment_count: int = 10,
 68    trim: float = 0.05,
 69    momentum_coef: float = 0,
 70    multiple_epochs=lambda x: 0,
 71    epsilon: float = 1e-10,
 72    order=(0, 0, 0),
 73    p: int = None,
 74    variance_estimation=None,
 75    cp_only: bool = False,
 76    vanilla_percentage: float = 0,
 77    warm_start: bool = False,
 78    **kwargs
 79):
 80    r"""Find change points efficiently.
 81
 82    Args:
 83        formula: A formula string specifying the model to be fitted. The
 84            optional response variable should be on the LHS, covariates on the
 85            RHS. Intercept should be removed by appending '- 1'. By default,
 86            an intercept column is added as in R's lm().
 87        data: A NumPy array of shape (T, d) containing the data to be
 88            segmented. Each row is a data point $z_t$ in $\mathbb{R}^d$.
 89        beta: Penalty criterion for the number of change points. Can be one of
 90            'BIC', 'MBIC', 'MDL', or a float value.
 91        cost_adjustment: Cost adjustment criterion modifying the cost function.
 92            Can be one of 'BIC', 'MBIC', 'MDL', or None.
 93        family: Family of change point model. One of: 'mean', 'variance',
 94            'meanvariance', 'lm', 'binomial', 'poisson', 'lasso', 'ar',
 95            'arma', 'arima', 'garch', 'var', 'custom'. If None, it is
 96            treated as 'custom'.
 97        cost: Custom cost function, e.g., ``cost(data)`` or
 98            ``cost(data, theta)``.
 99        cost_gradient: Gradient of custom cost, e.g.,
100            ``cost_gradient(data, theta)``.
101        cost_hessian: Hessian of custom cost, e.g.,
102            ``cost_hessian(data, theta)``.
103        line_search: Values for line search step sizes.
104        lower: Lower bound for parameters after each update.
105        upper: Upper bound for parameters after each update.
106        pruning_coef: Pruning coefficient for PELT algorithm.
107        segment_count: Initial guess for number of segments.
108        trim: Trimming proportion for boundary change points.
109        momentum_coef: Momentum coefficient for parameter updates.
110        multiple_epochs: A function that takes the segment length and
111            returns an int for additional epochs.
112        epsilon: Epsilon for numerical stability.
113        order: Order for AR, VAR, ARIMA, GARCH models.
114        p: Number of covariates. If None, inferred from data.
115        variance_estimation: Pre-specified variance/covariance matrix.
116        cp_only: If True, only change points are returned.
117        vanilla_percentage: Interpolation parameter between PELT and SeGD.
118        warm_start: If True, use the previous segment's parameters as
119            initial values for the new segment.
120        **kwargs: Additional model-specific parameters.
121
122    Returns:
123        A ``FastCPDResult`` object containing change points, costs,
124        residuals, and parameter estimates.
125    """
126    family = family.lower() if family is not None else 'custom'
127    assert family in ('mean', 'variance', 'meanvariance')
128    assert cost_adjustment in ('BIC', 'MBIC', 'MDL')
129
130    if variance_estimation is not None:
131        variance_estimation = numpy.asarray(variance_estimation)
132    elif family == 'mean':
133        variance_estimation = fastcpd.variance_estimation.mean(data)
134
135    if family == 'mean':
136        p = data.shape[1]
137    elif family == 'variance':
138        p = data.shape[1] ** 2
139    elif family == 'meanvariance':
140        p = data.shape[1] + data.shape[1] ** 2
141
142    if pruning_coef is None:
143        pruning_coef = 0.0
144    if cost_adjustment == "MBIC":
145        pruning_coef += p * log(2)
146    elif cost_adjustment == "MDL":
147        pruning_coef += p * log2(2)
148
149    if isinstance(beta, str):
150        if beta == 'BIC':
151            beta = (p + 1) * log(data.shape[0]) / 2
152        elif beta == 'MBIC':
153            beta = (p + 2) * log(data.shape[0]) / 2
154        elif beta == 'MDL':
155            beta = (p + 2) * log2(data.shape[0]) / 2
156        else:
157            raise ValueError(f"Unknown beta criterion: {beta}")
158
159    result = fastcpd_impl(
160        beta,
161        cost_adjustment,
162        cp_only,
163        data,
164        epsilon,
165        family,
166        line_search,
167        [],
168        momentum_coef,
169        order,
170        p,
171        0,
172        pruning_coef,
173        segment_count,
174        trim,
175        [],
176        1.0,
177        variance_estimation,
178        warm_start,
179    )
180    return result

def mean(data, **kwargs): View Source

def mean(data, **kwargs):
    """Detect change points under a mean change model.

    Convenience wrapper that forwards to ``detect()`` with the model family
    fixed to ``'mean'``.

    Args:
        data: Univariate or multivariate data, handed to ``detect()`` as its
            ``data`` argument.
        **kwargs: Extra keyword arguments forwarded verbatim to ``detect()``.
            ``family`` must not be given here because this wrapper already
            supplies it; doing so raises ``TypeError``.

    Returns:
        The value returned by ``detect()`` (documented there as a
        ``FastCPDResult`` object).
    """
    result = detect(family='mean', data=data, **kwargs)
    return result

Find change points efficiently in mean change models.

Arguments:

data: Univariate or multivariate data for mean change detection.
**kwargs: Additional arguments passed to detect().

Returns:

A FastCPDResult object.

def variance(data, **kwargs): View Source

def variance(data, **kwargs):
    """Detect change points under a variance change model.

    Convenience wrapper that forwards to ``detect()`` with the model family
    fixed to ``'variance'``.

    Args:
        data: Univariate or multivariate data, handed to ``detect()`` as its
            ``data`` argument.
        **kwargs: Extra keyword arguments forwarded verbatim to ``detect()``.
            ``family`` must not be given here because this wrapper already
            supplies it; doing so raises ``TypeError``.

    Returns:
        The value returned by ``detect()`` (documented there as a
        ``FastCPDResult`` object).
    """
    result = detect(family='variance', data=data, **kwargs)
    return result

Find change points efficiently in variance change models.

Arguments:

data: Univariate or multivariate data for variance change detection.
**kwargs: Additional arguments passed to detect().

Returns:

A FastCPDResult object.

def meanvariance(data, **kwargs): View Source

def meanvariance(data, **kwargs):
    """Detect change points under a mean and/or variance change model.

    Convenience wrapper that forwards to ``detect()`` with the model family
    fixed to ``'meanvariance'``.

    Args:
        data: Univariate or multivariate data, handed to ``detect()`` as its
            ``data`` argument.
        **kwargs: Extra keyword arguments forwarded verbatim to ``detect()``.
            ``family`` must not be given here because this wrapper already
            supplies it; doing so raises ``TypeError``.

    Returns:
        The value returned by ``detect()`` (documented there as a
        ``FastCPDResult`` object).
    """
    result = detect(family='meanvariance', data=data, **kwargs)
    return result

Find change points efficiently in mean and/or variance change models.

Arguments:

data: Univariate or multivariate data for mean and/or variance change detection.
**kwargs: Additional arguments passed to detect().

Returns:

A FastCPDResult object.

def detect( formula: str = 'y ~ . - 1', data: numpy.ndarray = None, beta: object = 'MBIC', cost_adjustment: str = 'MBIC', family: str = None, cost=None, cost_gradient=None, cost_hessian=None, line_search=(1,), lower=None, upper=None, pruning_coef: float = None, segment_count: int = 10, trim: float = 0.05, momentum_coef: float = 0, multiple_epochs=lambda x: 0, epsilon: float = 1e-10, order=(0, 0, 0), p: int = None, variance_estimation=None, cp_only: bool = False, vanilla_percentage: float = 0, warm_start: bool = False, **kwargs): View Source

 55def detect(
 56    formula: str = 'y ~ . - 1',
 57    data: numpy.ndarray = None,
 58    beta: object = 'MBIC',
 59    cost_adjustment: str = 'MBIC',
 60    family: str = None,
 61    cost=None,
 62    cost_gradient=None,
 63    cost_hessian=None,
 64    line_search=(1,),
 65    lower=None,
 66    upper=None,
 67    pruning_coef: float = None,
 68    segment_count: int = 10,
 69    trim: float = 0.05,
 70    momentum_coef: float = 0,
 71    multiple_epochs=lambda x: 0,
 72    epsilon: float = 1e-10,
 73    order=(0, 0, 0),
 74    p: int = None,
 75    variance_estimation=None,
 76    cp_only: bool = False,
 77    vanilla_percentage: float = 0,
 78    warm_start: bool = False,
 79    **kwargs
 80):
 81    r"""Find change points efficiently.
 82
 83    Args:
 84        formula: A formula string specifying the model to be fitted. The
 85            optional response variable should be on the LHS, covariates on the
 86            RHS. Intercept should be removed by appending '- 1'. By default,
 87            an intercept column is added as in R's lm().
 88        data: A NumPy array of shape (T, d) containing the data to be
 89            segmented. Each row is a data point $z_t$ in $\mathbb{R}^d$.
 90        beta: Penalty criterion for the number of change points. Can be one of
 91            'BIC', 'MBIC', 'MDL', or a float value.
 92        cost_adjustment: Cost adjustment criterion modifying the cost function.
 93            Can be one of 'BIC', 'MBIC', 'MDL', or None.
 94        family: Family of change point model. One of: 'mean', 'variance',
 95            'meanvariance', 'lm', 'binomial', 'poisson', 'lasso', 'ar',
 96            'arma', 'arima', 'garch', 'var', 'custom'. If None, it is
 97            treated as 'custom'.
 98        cost: Custom cost function, e.g., ``cost(data)`` or
 99            ``cost(data, theta)``.
100        cost_gradient: Gradient of custom cost, e.g.,
101            ``cost_gradient(data, theta)``.
102        cost_hessian: Hessian of custom cost, e.g.,
103            ``cost_hessian(data, theta)``.
104        line_search: Values for line search step sizes.
105        lower: Lower bound for parameters after each update.
106        upper: Upper bound for parameters after each update.
107        pruning_coef: Pruning coefficient for PELT algorithm.
108        segment_count: Initial guess for number of segments.
109        trim: Trimming proportion for boundary change points.
110        momentum_coef: Momentum coefficient for parameter updates.
111        multiple_epochs: A function that takes the segment length and
112            returns an int for additional epochs.
113        epsilon: Epsilon for numerical stability.
114        order: Order for AR, VAR, ARIMA, GARCH models.
115        p: Number of covariates. If None, inferred from data.
116        variance_estimation: Pre-specified variance/covariance matrix.
117        cp_only: If True, only change points are returned.
118        vanilla_percentage: Interpolation parameter between PELT and SeGD.
119        warm_start: If True, use the previous segment's parameters as
120            initial values for the new segment.
121        **kwargs: Additional model-specific parameters.
122
123    Returns:
124        A ``FastCPDResult`` object containing change points, costs,
125        residuals, and parameter estimates.
126    """
127    family = family.lower() if family is not None else 'custom'
128    assert family in ('mean', 'variance', 'meanvariance')
129    assert cost_adjustment in ('BIC', 'MBIC', 'MDL')
130
131    if variance_estimation is not None:
132        variance_estimation = numpy.asarray(variance_estimation)
133    elif family == 'mean':
134        variance_estimation = fastcpd.variance_estimation.mean(data)
135
136    if family == 'mean':
137        p = data.shape[1]
138    elif family == 'variance':
139        p = data.shape[1] ** 2
140    elif family == 'meanvariance':
141        p = data.shape[1] + data.shape[1] ** 2
142
143    if pruning_coef is None:
144        pruning_coef = 0.0
145    if cost_adjustment == "MBIC":
146        pruning_coef += p * log(2)
147    elif cost_adjustment == "MDL":
148        pruning_coef += p * log2(2)
149
150    if isinstance(beta, str):
151        if beta == 'BIC':
152            beta = (p + 1) * log(data.shape[0]) / 2
153        elif beta == 'MBIC':
154            beta = (p + 2) * log(data.shape[0]) / 2
155        elif beta == 'MDL':
156            beta = (p + 2) * log2(data.shape[0]) / 2
157        else:
158            raise ValueError(f"Unknown beta criterion: {beta}")
159
160    result = fastcpd_impl(
161        beta,
162        cost_adjustment,
163        cp_only,
164        data,
165        epsilon,
166        family,
167        line_search,
168        [],
169        momentum_coef,
170        order,
171        p,
172        0,
173        pruning_coef,
174        segment_count,
175        trim,
176        [],
177        1.0,
178        variance_estimation,
179        warm_start,
180    )
181    return result

Find change points efficiently.

Arguments:

formula: A formula string specifying the model to be fitted. The optional response variable should be on the LHS, covariates on the RHS. Intercept should be removed by appending '- 1'. By default, an intercept column is added as in R's lm().
data: A NumPy array of shape (T, d) containing the data to be segmented. Each row is a data point $z_t$ in $\mathbb{R}^d$.
beta: Penalty criterion for the number of change points. Can be one of 'BIC', 'MBIC', 'MDL', or a float value.
cost_adjustment: Cost adjustment criterion modifying the cost function. Must be one of 'BIC', 'MBIC', or 'MDL'. None is not accepted: the implementation shown above rejects any value outside these three.
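family: Family of change point model (case-insensitive). The family names are 'mean', 'variance', 'meanvariance', 'lm', 'binomial', 'poisson', 'lasso', 'ar', 'arma', 'arima', 'garch', 'var', 'custom'; None is treated as 'custom'. Note that the implementation shown above only accepts 'mean', 'variance', and 'meanvariance'; every other value, including the default None, is rejected.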
cost: Custom cost function, e.g., cost(data) or cost(data, theta).
cost_gradient: Gradient of custom cost, e.g., cost_gradient(data, theta).
cost_hessian: Hessian of custom cost, e.g., cost_hessian(data, theta).
line_search: Values for line search step sizes.
lower: Lower bound for parameters after each update.
upper: Upper bound for parameters after each update.
pruning_coef: Pruning coefficient for PELT algorithm.
segment_count: Initial guess for number of segments.
trim: Trimming proportion for boundary change points.
momentum_coef: Momentum coefficient for parameter updates.
multiple_epochs: A function that takes the segment length and returns an int for additional epochs.
epsilon: Epsilon for numerical stability.
order: Order for AR, VAR, ARIMA, GARCH models.
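p: Number of covariates. In the implementation shown above the value is always derived from the number of data columns $d$ ($d$ for 'mean', $d^2$ for 'variance', $d + d^2$ for 'meanvariance'), so a supplied value is overwritten.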
variance_estimation: Pre-specified variance/covariance matrix.
cp_only: If True, only change points are returned.
vanilla_percentage: Interpolation parameter between PELT and SeGD.
warm_start: If True, use the previous segment's parameters as initial values for the new segment.
**kwargs: Additional model-specific parameters.

Returns:

A FastCPDResult object containing change points, costs, residuals, and parameter estimates.