Source code for dyconnmap.ts.ci

""" Complexity Index


Complexity index (Janson2004_, Rapp2007_) computes the `l`-subword complexity (`l`-subword spectrum) of a one-dimensional,
symbolic (integer) time series, by finding the number of distinct subwords
of length `l`. The total complexity is given by the sum of all subwords of different lengths.
The unnormalized measure can be used to compare sequences of equal lengths.


Notes
-----
* This is a direct translation from the `Complexity toolbox available at` http://users.auth.gr/~stdimitr/files/software/complexitiy.rar
* Original author is Stravros Dimitriadis <stidimitriadis@gmail.com>

|

.. [Janson2004] Janson, S., Lonardi, S., & Szpankowski, W. (2004). On average sequence complexity. Theoretical Computer Science, 326(1-3), 213-227.
.. [Rapp2007] Rapp, P. E. (2007). Quantitative characterization of animal behavior following blast exposure. Cognitive neurodynamics, 1(4), 287-293.

"""
# Author: Avraam Marimpis <avraam.marimpis@gmail.com>

from typing import Tuple, Optional, Union

import numpy as np


[docs]def complexity_index(
    x: "np.ndarray[np.int32]",
    sub_len: Optional[int] = -1,
    normalize: Optional[bool] = False,
    iterations: Optional[int] = 100,
) -> Union[
    Tuple[np.float32, "np.ndarray[np.int32]"],
    Tuple[np.float32, np.float32, "np.ndarray[np.int32]"],
]:
    """ Complexity Index


    Parameters
    ----------
    x : array-like, shape(n_samples)
        Input symbolic time series.

    sub_len : int
        Maximum subword length. Default is `len(x) - 1`.

    normalize : bool
        Normalize result. Default is `False`.

    iterationss : int
        Number of iterations to perform randomization. Default is `100`.


    Returns
    -------
    normal_ci : float
        The computed omplexity index after normalization against the randomization
        procedure.

    ci : float
        The computed complexity index.

    spectrum : array-like
        A list of the number of distinct subwords of length 1, up to the size
        of the input symbolic time series.
    """
    x = x.astype(np.int32)
    x = x.flatten()
    len_x = len(x)

    ci, spectrum = __compute_complexity_index(x, sub_len)

    if normalize:
        rng = np.random.RandomState(0)

        mean_ci = 0.0
        num_letters = spectrum[0]

        for _ in range(iterations):
            new_x = np.int32(np.floor(rng.rand(len_x) * num_letters))  # type: ignore
            new_ci, _ = __compute_complexity_index(new_x, sub_len)
            mean_ci += new_ci / iterations

        normal_ci = np.float32(ci / mean_ci)

        return normal_ci, ci, spectrum

    else:
        return ci, spectrum


def __compute_complexity_index(
    x, sub_len=-1
) -> Tuple[np.float32, "np.ndarray[np.int32]"]:
    """ Complexity Index


    Parameters
    ----------
    x :
        Input symbolic time series.

    sub_len : int
        Maximum subword length. Default is `len(x) - 1`.


    Returns
    -------
    ci : float
        The computed complexity index.

    spectrum : array-like
        A list of the number of distinct subwords of length 1, up to the size
        of the input symbolic time series.
    """
    ci = 0.0

    # x = x.astype(np.int32)
    # x = x.flatten()

    len_x = len(x)

    max_subword_len = len_x - 1
    if sub_len >= 2:
        max_subword_len = sub_len

    min_x = np.min(x)
    x = x - min_x

    letters = np.unique(x)
    max_len_word = np.min([max_subword_len, len_x - 1])  # type: ignore

    spectrum = np.ones((max_len_word), dtype=np.int32)
    spectrum[0] = len(letters)

    all_num_words = list()
    for word_len in range(1, max_len_word):
        real_word_len = word_len + 1
        cumulative_words = None

        for shift in range(real_word_len):
            num_words = np.int32(np.floor((len_x - shift) / real_word_len))
            all_num_words.append(num_words)

            if num_words > 0:
                idx1 = shift
                idx2 = real_word_len * num_words + shift
                sliced = x[idx1:idx2]
                words = np.reshape(sliced, (num_words, real_word_len)).T

                if cumulative_words is None:
                    cumulative_words = words
                else:
                    cumulative_words = np.hstack([cumulative_words, words])

        conv_cumulative_words = __rowsBaseConv(cumulative_words.T)  # type: ignore
        u_cumulative_words = np.unique(conv_cumulative_words)

        spectrum[word_len] = len(u_cumulative_words)
        cumulative_words = None

    ci = np.float32(np.sum(spectrum))

    all_num_words = np.array(all_num_words).flatten()

    return ci, spectrum


def __rowsBaseConv(
    x: "np.ndarray[np.int32]", base: Optional[int] = None
) -> "np.ndarray[np.float64]":
    """

    Parameters
    ----------
    x :

    base : integer


    Returns
    -------


    """
    if base is None:
        base = np.max(x) + 1

    _, p = np.shape(x)  # type: ignore

    bases = np.ones(p) * base
    indices = list(range(p - 1, -1, -1))
    base = np.power(bases, indices)  # type: ignore

    result = x.dot(base).astype(np.float64)  # type: ignore

    return result