Source code for tsfel.feature_extraction.features_utils

import numpy as np
import scipy.signal
import scipy.stats
from sklearn.neighbors import KDTree


def set_domain(key, value):
    def decorate_func(func):
        setattr(func, key, value)
        return func

    return decorate_func

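# Usage sketch (illustrative; `_example_set_domain` is not part of tsfel): the
# decorator attaches metadata to a function as a plain attribute.
def _example_set_domain():
    @set_domain("domain", "statistical")
    def my_feature(signal):
        return np.mean(signal)

    assert my_feature.domain == "statistical"
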
def compute_time(signal, fs):
    """Creates the time array corresponding to the signal.

    Parameters
    ----------
    signal : nd-array
        Input from which the time is computed.
    fs : int
        Sampling frequency

    Returns
    -------
    time : nd-array
        Signal time

    """
    return np.arange(0, len(signal)) / fs

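# Usage sketch (illustrative; `_example_compute_time` is not part of tsfel):
# 10 samples at 10 Hz span 0.0 s to 0.9 s.
def _example_compute_time():
    t = compute_time(np.zeros(10), fs=10)
    assert len(t) == 10 and t[0] == 0.0 and np.isclose(t[-1], 0.9)
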
def calc_fft(signal, fs):
    """This function computes the FFT of a signal.

    Parameters
    ----------
    signal : nd-array
        The input signal from which the FFT is computed
    fs : float
        Sampling frequency

    Returns
    -------
    f : nd-array
        Frequency values (x-axis)
    fmag : nd-array
        Amplitude of the frequency values (y-axis)

    """
    fmag = np.abs(np.fft.rfft(signal))
    f = np.fft.rfftfreq(len(signal), d=1 / fs)

    return f.copy(), fmag.copy()

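# Usage sketch (illustrative; `_example_calc_fft` is not part of tsfel): a 5 Hz
# sine sampled at 100 Hz for 1 s should peak at the 5 Hz FFT bin.
def _example_calc_fft():
    fs = 100.0
    t = np.arange(0, 1, 1 / fs)
    f, fmag = calc_fft(np.sin(2 * np.pi * 5 * t), fs)
    assert np.isclose(f[np.argmax(fmag)], 5.0)
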
def filterbank(signal, fs, pre_emphasis=0.97, nfft=512, nfilt=40):
    """Computes the MEL-spaced filterbank.

    It provides the information about the power in each frequency band.

    Implementation details and description on:
    https://www.kaggle.com/ilyamich/mfcc-implementation-and-tutorial
    https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html#fnref:1

    Parameters
    ----------
    signal : nd-array
        Input from which the filterbank is computed
    fs : float
        Sampling frequency
    pre_emphasis : float
        Pre-emphasis coefficient for the pre-emphasis filter application
    nfft : int
        Number of points of the FFT
    nfilt : int
        Number of filters

    Returns
    -------
    nd-array
        MEL-spaced filterbank

    """
    # The signal is already a window from the original signal, so no framing is needed.
    # According to the references, a window function (e.g. a Hann window) should be
    # applied. However, if the signal windows do not overlap, information would be
    # lost, since a Hann window attenuates the edges of each window.

    # Pre-emphasis filter to amplify the high frequencies
    emphasized_signal = np.append(
        np.array(signal)[0],
        np.array(signal[1:]) - pre_emphasis * np.array(signal[:-1]),
    )

    # Fourier transform and power spectrum
    mag_frames = np.absolute(np.fft.rfft(emphasized_signal, nfft))  # Magnitude of the FFT
    pow_frames = (1.0 / nfft) * (mag_frames**2)  # Power spectrum

    low_freq_mel = 0
    high_freq_mel = 2595 * np.log10(1 + (fs / 2) / 700)  # Convert Hz to Mel
    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)  # Equally spaced in the Mel scale
    hz_points = 700 * (10 ** (mel_points / 2595) - 1)  # Convert Mel to Hz
    filter_bin = np.floor((nfft + 1) * hz_points / fs)

    fbank = np.zeros((nfilt, int(np.floor(nfft / 2 + 1))))
    for m in np.arange(1, nfilt + 1):
        f_m_minus = int(filter_bin[m - 1])  # left
        f_m = int(filter_bin[m])  # center
        f_m_plus = int(filter_bin[m + 1])  # right

        for k in np.arange(f_m_minus, f_m):
            fbank[m - 1, k] = (k - filter_bin[m - 1]) / (filter_bin[m] - filter_bin[m - 1])
        for k in np.arange(f_m, f_m_plus):
            fbank[m - 1, k] = (filter_bin[m + 1] - k) / (filter_bin[m + 1] - filter_bin[m])

    # Area normalization
    # Without it, the noise would increase with frequency because of the filter width.
    enorm = 2.0 / (hz_points[2 : nfilt + 2] - hz_points[:nfilt])
    fbank *= enorm[:, np.newaxis]

    filter_banks = np.dot(pow_frames, fbank.T)
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)  # Numerical stability
    filter_banks = 20 * np.log10(filter_banks)  # dB

    return filter_banks

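# Usage sketch (illustrative; `_example_filterbank` is not part of tsfel): the
# output has one power value per Mel filter, so `nfilt` sets the feature length.
def _example_filterbank():
    rng = np.random.default_rng(0)
    fb = filterbank(rng.standard_normal(256), fs=1000.0, nfilt=40)
    assert fb.shape == (40,) and np.all(np.isfinite(fb))
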
def autocorr_norm(signal):
    """Computes the normalized autocorrelation.

    Implementation details and description in:
    https://ccrma.stanford.edu/~orchi/Documents/speaker_recognition_report.pdf

    Parameters
    ----------
    signal : nd-array
        Input from which the linear prediction coefficients are computed

    Returns
    -------
    nd-array
        Autocorrelation result

    """
    variance = np.var(signal)
    signal = np.copy(signal - signal.mean())
    r = scipy.signal.correlate(signal, signal)[-len(signal) :]

    # A constant signal has zero variance; return zeros to avoid dividing by zero
    if (signal == 0).all():
        return np.zeros(len(signal))

    acf = r / variance / len(signal)

    return acf

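# Usage sketch (illustrative; `_example_autocorr_norm` is not part of tsfel):
# the normalized autocorrelation of a non-constant signal is 1 at lag 0.
def _example_autocorr_norm():
    rng = np.random.default_rng(0)
    acf = autocorr_norm(rng.standard_normal(64))
    assert np.isclose(acf[0], 1.0)
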
def create_symmetric_matrix(acf, order=11):
    """Computes a symmetric matrix.

    Implementation details and description in:
    https://ccrma.stanford.edu/~orchi/Documents/speaker_recognition_report.pdf

    Parameters
    ----------
    acf : nd-array
        Input from which a symmetric matrix is computed
    order : int
        Order of the matrix

    Returns
    -------
    nd-array
        Symmetric matrix

    """
    smatrix = np.empty((order, order))
    xx = np.arange(order)
    j = np.tile(xx, order)
    i = np.repeat(xx, order)
    smatrix[i, j] = acf[np.abs(i - j)]

    return smatrix

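# Usage sketch (illustrative; `_example_create_symmetric_matrix` is not part of
# tsfel): entry (i, j) holds acf[|i - j|], a symmetric Toeplitz structure.
def _example_create_symmetric_matrix():
    m = create_symmetric_matrix(np.array([1.0, 0.5, 0.25]), order=3)
    assert np.array_equal(m, m.T) and m[0, 2] == 0.25
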
def lpc(signal, n_coeff=12):
    """Computes the linear prediction coefficients.

    Implementation details and description in:
    https://ccrma.stanford.edu/~orchi/Documents/speaker_recognition_report.pdf

    Parameters
    ----------
    signal : nd-array
        Input from which the linear prediction coefficients are computed
    n_coeff : int
        Number of coefficients

    Returns
    -------
    nd-array
        Linear prediction coefficients

    """
    if signal.ndim > 1:
        raise ValueError("Only 1 dimensional arrays are valid")
    if n_coeff > signal.size:
        raise ValueError("Input signal must have a length >= n_coeff")

    # Calculate the order based on the number of coefficients
    order = n_coeff - 1

    # Calculate LPC with the Yule-Walker method
    acf = np.correlate(signal, signal, "full")

    r = np.zeros(order + 1, "float32")
    # Assures it works for all input lengths
    nx = np.min([order + 1, len(signal)])
    r[:nx] = acf[len(signal) - 1 : len(signal) + order]

    smatrix = create_symmetric_matrix(r[:-1], order)
    if np.sum(smatrix) == 0:
        return tuple(np.zeros(order + 1))

    lpc_coeffs = np.dot(np.linalg.inv(smatrix), -r[1:])

    return tuple(np.concatenate(([1.0], lpc_coeffs)))

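# Usage sketch (illustrative; `_example_lpc` is not part of tsfel): `n_coeff`
# values are returned and the leading coefficient is always 1.
def _example_lpc():
    rng = np.random.default_rng(0)
    coeffs = lpc(rng.standard_normal(128), n_coeff=12)
    assert len(coeffs) == 12 and coeffs[0] == 1.0
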
def create_xx(features):
    """Computes the range of the features' amplitude for the probability
    density function calculation.

    Parameters
    ----------
    features : nd-array
        Input features

    Returns
    -------
    nd-array
        Range of the features' amplitude

    """
    features_ = np.copy(features)

    # When all values are negative, the absolute value of the maximum is used
    # as the upper bound
    if max(features_) < 0:
        max_f = -max(features_)
        min_f = min(features_)
    else:
        min_f = min(features_)
        max_f = max(features_)

    # For a constant input, expand the range so the grid is not degenerate
    if min(features_) == max(features_):
        xx = np.linspace(min_f, min_f + 10, len(features_))
    else:
        xx = np.linspace(min_f, max_f, len(features_))

    return xx

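# Usage sketch (illustrative; `_example_create_xx` is not part of tsfel): for a
# constant input, the range is expanded by 10 so downstream PDFs stay defined.
def _example_create_xx():
    xx = create_xx(np.array([2.0, 2.0, 2.0]))
    assert xx[0] == 2.0 and xx[-1] == 12.0
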
def kde(features):
    """Computes the probability density function of the input signal using a
    Gaussian KDE (Kernel Density Estimate).

    Parameters
    ----------
    features : nd-array
        Input from which the probability density function is computed

    Returns
    -------
    nd-array
        Probability density values

    """
    features_ = np.copy(features)
    xx = create_xx(features_)

    # Add a small amount of noise so a constant input does not yield a
    # singular covariance matrix
    if min(features_) == max(features_):
        noise = np.random.randn(len(features_)) * 0.0001
        features_ = np.copy(features_ + noise)

    kernel = scipy.stats.gaussian_kde(features_, bw_method="silverman")

    return np.array(kernel(xx) / np.sum(kernel(xx)))

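# Usage sketch (illustrative; `_example_kde` is not part of tsfel): the output
# is a discretized density, so it sums to 1.
def _example_kde():
    rng = np.random.default_rng(0)
    pdf = kde(rng.standard_normal(100))
    assert np.isclose(np.sum(pdf), 1.0)
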
def gaussian(features):
    """Computes the probability density function of the input signal using a
    Gaussian function.

    Parameters
    ----------
    features : nd-array
        Input from which the probability density function is computed

    Returns
    -------
    nd-array
        Probability density values

    """
    features_ = np.copy(features)
    xx = create_xx(features_)
    std_value = np.std(features_)
    mean_value = np.mean(features_)

    # A constant input has no spread, so no density can be estimated
    if std_value == 0:
        return 0.0

    pdf_gauss = scipy.stats.norm.pdf(xx, mean_value, std_value)

    return np.array(pdf_gauss / np.sum(pdf_gauss))

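# Usage sketch (illustrative; `_example_gaussian` is not part of tsfel): the
# density sums to 1; a constant input has zero deviation and returns 0.0.
def _example_gaussian():
    assert np.isclose(np.sum(gaussian(np.array([1.0, 2.0, 3.0, 4.0]))), 1.0)
    assert gaussian(np.ones(5)) == 0.0
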
def wavelet(signal, function=scipy.signal.ricker, widths=np.arange(1, 10)):
    """Computes the CWT (continuous wavelet transform) of the signal.

    Parameters
    ----------
    signal : nd-array
        Input from which the CWT is computed
    function : wavelet function
        Default: scipy.signal.ricker
    widths : nd-array
        Widths to use for the transformation
        Default: np.arange(1, 10)

    Returns
    -------
    nd-array
        The result of the CWT along the time axis: a matrix with size
        (len(widths), len(signal))

    """
    if isinstance(function, str):
        function = eval(function)

    if isinstance(widths, str):
        widths = eval(widths)

    cwt = scipy.signal.cwt(signal, function, widths)

    return cwt

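# Usage sketch (illustrative; `_example_wavelet` is not part of tsfel): the CWT
# returns one row per width. Note that `scipy.signal.ricker` and
# `scipy.signal.cwt` were removed in SciPy 1.15, so an older SciPy is required.
def _example_wavelet():
    cwt = wavelet(np.sin(np.linspace(0, 10, 100)))
    assert cwt.shape == (9, 100)
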
def calc_ecdf(signal):
    """Computes the ECDF (empirical cumulative distribution function) of the
    signal.

    Parameters
    ----------
    signal : nd-array
        Input from which the ECDF is computed

    Returns
    -------
    nd-array
        Sorted signal and computed ECDF.

    """
    return np.sort(signal), np.arange(1, len(signal) + 1) / len(signal)

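# Usage sketch (illustrative; `_example_calc_ecdf` is not part of tsfel): the
# ECDF is evaluated on the sorted samples and ends at 1.
def _example_calc_ecdf():
    x, y = calc_ecdf(np.array([3.0, 1.0, 2.0]))
    assert np.array_equal(x, [1.0, 2.0, 3.0]) and y[-1] == 1.0
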
def coarse_graining(signal, scale):
    """Applies a coarse-graining process to a time series: for a given scale
    factor, it splits the signal into non-overlapping windows and averages the
    data points.

    Parameters
    ----------
    signal : np.ndarray
        Input signal.
    scale : int
        Scale factor; determines the length of the non-overlapping windows.

    Returns
    -------
    coarsegrained_signal : np.ndarray
        Coarse-grained signal.

    """
    n = len(signal)
    windows = n // scale
    usable_n = windows * scale

    # Trim the leftover samples and average within each window
    windowed_signal = np.reshape(signal[0:usable_n], (windows, scale))
    coarsegrained_signal = np.nanmean(windowed_signal, axis=1)

    return coarsegrained_signal

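# Usage sketch (illustrative; `_example_coarse_graining` is not part of tsfel):
# with scale 2, consecutive pairs are averaged and the leftover sample dropped.
def _example_coarse_graining():
    out = coarse_graining(np.array([1.0, 3.0, 5.0, 7.0, 9.0]), scale=2)
    assert np.array_equal(out, [2.0, 5.0])
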
def get_templates(signal, m=3):
    """Helper function for the sample entropy calculation. Divides a signal
    into template vectors of length m.

    Parameters
    ----------
    signal : np.ndarray
        Input signal.
    m : int
        Embedding dimension that defines the length of the template vectors,
        defaults to 3.

    Returns
    -------
    np.ndarray
        Array of template vectors.

    """
    return np.array([signal[i : i + m] for i in np.arange(len(signal) - m + 1)])

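# Usage sketch (illustrative; `_example_get_templates` is not part of tsfel):
# a 4-sample signal yields 4 - m + 1 = 3 overlapping templates of length 2.
def _example_get_templates():
    templates = get_templates(np.array([1.0, 2.0, 3.0, 4.0]), m=2)
    assert templates.shape == (3, 2)
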
def sample_entropy(signal, m, tolerance):
    """Computes the sample entropy of a signal.

    Parameters
    ----------
    signal : np.ndarray
        Input signal.
    m : int
        Embedding dimension that defines the length of the template vectors,
        defaults to 3.
    tolerance : float
        Tolerance value, defaults to 0.2 times the standard deviation of the
        input signal.

    Returns
    -------
    float
        Sample entropy of the signal.

    """
    templates_B = get_templates(signal, m)
    # Drop the last template so both template sets have the same length
    templates_B = templates_B[:-1]
    kdtree_B = KDTree(templates_B, metric="chebyshev")
    count_B = kdtree_B.query_radius(templates_B, tolerance, count_only=True).astype(np.float64)
    # Subtract 1 from each count to exclude the self-match
    proportion_B = np.mean((count_B - 1) / (templates_B.shape[0] - 1))

    templates_A = get_templates(signal, m + 1)
    kdtree_A = KDTree(templates_A, metric="chebyshev")
    count_A = kdtree_A.query_radius(templates_A, tolerance, count_only=True).astype(np.float64)
    proportion_A = np.mean((count_A - 1) / (templates_A.shape[0] - 1))

    if proportion_B > 0 and proportion_A > 0:
        return -np.log(proportion_A / proportion_B)

    return np.nan

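# Usage sketch (illustrative; `_example_sample_entropy` is not part of tsfel):
# with the conventional m = 2 and tolerance = 0.2 * std, white noise yields a
# finite, positive sample entropy.
def _example_sample_entropy():
    rng = np.random.default_rng(0)
    x = rng.standard_normal(500)
    se = sample_entropy(x, m=2, tolerance=0.2 * np.std(x))
    assert np.isfinite(se) and se > 0
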
def calc_rms(signal, window):
    """Windowed root mean square (RMS) with linear detrending.

    Parameters
    ----------
    signal : nd-array
        Signal
    window : int
        Length of the window in which the RMS will be calculated

    Returns
    -------
    rms : nd-array
        RMS data in each window, with length len(signal) // window

    """
    num_windows = len(signal) // window
    rms = np.zeros(num_windows)

    for idx in np.arange(num_windows):
        start_idx = idx * window
        end_idx = start_idx + window
        windowed_signal = signal[start_idx:end_idx]

        # Remove the linear trend from the window before computing the RMS
        coeff = np.polyfit(np.arange(window), windowed_signal, 1)
        detrended_window = windowed_signal - np.polyval(coeff, np.arange(window))
        rms[idx] = np.sqrt(np.mean(detrended_window**2))

    return rms

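# Usage sketch (illustrative; `_example_calc_rms` is not part of tsfel): a pure
# linear trend is removed inside each window, so its windowed RMS is ~0.
def _example_calc_rms():
    rms = calc_rms(np.arange(100, dtype=float), window=10)
    assert rms.shape == (10,) and np.allclose(rms, 0.0)
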
def compute_rs(signal, lag):
    """Computes the average rescaled range for a window of length lag.

    Parameters
    ----------
    signal : np.ndarray
        Input signal.
    lag : int
        Window length.

    Returns
    -------
    float
        Average R/S.

    """
    n = len(signal)
    windowed_signal = np.reshape(signal[: n - n % lag], (-1, lag))
    mean_windows = np.mean(windowed_signal, axis=1)

    # Cumulative deviation from the mean within each window
    accumulated_windowed_signal = np.cumsum(
        windowed_signal - np.reshape(mean_windows, (-1, 1)),
        axis=1,
    )

    # Range (R) and standard deviation (S) of each window
    r = np.max(accumulated_windowed_signal, axis=1) - np.min(
        accumulated_windowed_signal,
        axis=1,
    )
    s = np.std(windowed_signal, axis=1)
    rs = np.divide(r, s)

    return np.mean(rs)

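# Usage sketch (illustrative; `_example_compute_rs` is not part of tsfel): the
# average rescaled range of white noise at a given window length is positive.
def _example_compute_rs():
    rng = np.random.default_rng(0)
    assert compute_rs(rng.standard_normal(200), lag=20) > 0
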
def calc_lengths_higuchi(signal):
    """Computes the curve lengths for different subdivisions, using Higuchi's
    method.

    Parameters
    ----------
    signal : np.ndarray
        Input signal.

    Returns
    -------
    k_values : nd-array
        Scale factors used for the subdivisions.
    lk : list
        Length of the curve for each subdivision.

    """
    n = len(signal)
    k_values = np.arange(1, n // 10)
    lk = []

    for k in k_values:
        lmk = 0
        for m in np.arange(1, k + 1):
            sum_length = 0
            for i in np.arange(1, (n - m) // k + 1):
                sum_length += abs(signal[m + i * k - 1] - signal[m + (i - 1) * k - 1])
            lmk += (sum_length * (n - 1)) / (((n - m) // k) * k**2)
        lk.append(lmk / k)

    return k_values, lk

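# Usage sketch (illustrative; `_example_calc_lengths_higuchi` is not part of
# tsfel): one curve length is returned for each k in 1 .. n // 10 - 1.
def _example_calc_lengths_higuchi():
    k_values, lk = calc_lengths_higuchi(np.sin(np.linspace(0, 10, 100)))
    assert len(k_values) == len(lk) == 9
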
def calc_lempel_ziv_complexity(sequence):
    """Manual implementation of the Lempel-Ziv complexity.

    It is defined as the number of different substrings encountered as the
    stream is viewed from beginning to end.

    Reference: https://github.com/Naereen/Lempel-Ziv_Complexity/blob/master/src/lempel_ziv_complexity.py

    Parameters
    ----------
    sequence : string
        Binarised signal, as a string of characters

    Returns
    -------
    float
        LZ index

    """
    sub_strings = set()

    ind = 0
    inc = 1
    while True:
        if ind + inc > len(sequence):
            break
        sub_str = sequence[ind : ind + inc]
        if sub_str in sub_strings:
            # Substring already seen: extend it by one character
            inc += 1
        else:
            # New substring: record it and restart after it
            sub_strings.add(sub_str)
            ind += inc
            inc = 1

    return len(sub_strings) / len(sequence)

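# Usage sketch (illustrative; `_example_calc_lempel_ziv_complexity` is not part
# of tsfel): "0001" parses into the distinct substrings 0 / 00 / 1, so the
# index is 3 / 4.
def _example_calc_lempel_ziv_complexity():
    assert calc_lempel_ziv_complexity("0001") == 0.75
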
def find_plateau(y, threshold=0.1, consecutive_points=5):
    """Finds a plateau (if it exists).

    Parameters
    ----------
    y : np.ndarray
        Array of y-axis values.
    threshold : float
        Slope threshold to consider as a plateau (default is 0.1).
    consecutive_points : int
        Number of consecutive points with a small derivative to consider as a
        plateau (default is 5).

    Returns
    -------
    int
        Index of the beginning of the plateau if one is found, length of y
        otherwise.

    """
    dy = np.diff(y)

    for i in np.arange(len(dy) - consecutive_points + 1):
        if np.all(np.abs(dy[i : i + consecutive_points]) < threshold):
            plateau_value = np.mean(y[i : i + consecutive_points])
            # Only accept plateaus that lie above the overall mean of the curve
            if plateau_value > np.mean(y):
                return i

    # No plateau found
    return len(y)

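# Usage sketch (illustrative; `_example_find_plateau` is not part of tsfel): a
# ramp followed by a flat region above the overall mean returns the index where
# the flat region starts (here, sample 19).
def _example_find_plateau():
    y = np.concatenate((np.linspace(0, 5, 20), np.full(20, 5.0)))
    assert find_plateau(y, threshold=0.1, consecutive_points=5) == 19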